diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/bug.md
similarity index 96%
rename from .github/ISSUE_TEMPLATE/custom.md
rename to .github/ISSUE_TEMPLATE/bug.md
index 8fd955356..c003fe7c1 100644
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -1,8 +1,7 @@
 ---
-name: Issue and enhancement template
-about: Used to report issues and request enhancements for llama.cpp
-title: "[User] Insert summary of your issue or enhancement.."
-labels: ''
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug-unconfirmed"]
 assignees: ''
 
 ---
@@ -46,7 +45,7 @@ $ g++ --version
 
 # Failure Information (for bugs)
 
-Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+Please help provide information about the failure / bug.
 
 # Steps to Reproduce
 
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
new file mode 100644
index 000000000..dcffda750
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
diff --git a/.gitignore b/.gitignore index d288e66fc..50cbd0b47 100644 --- a/.gitignore +++ b/.gitignore @@ -10,10 +10,12 @@ *.gcno *.gcda *.dot +*.bat *.metallib .DS_Store .build/ .cache/ +.ccls-cache/ .direnv/ .envrc .swiftpm @@ -44,6 +46,7 @@ models-mnt /infill /libllama.so /llama-bench +/llava /main /metal /perplexity @@ -62,7 +65,7 @@ models-mnt /parallel /train-text-from-scratch /vdot -build-info.h +/common/build-info.cpp arm_neon.h compile_commands.json CMakeSettings.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 9184eda8f..3c49d645c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ endif() # general option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" ON) +option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF) # debug @@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use CUDA" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) +option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) @@ -93,46 +94,12 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_MPI "llama: use MPI" OFF) -option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) -# -# Build info header -# - -# Generate initial build-info.h -include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) - -if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git") - - # Is git submodule - if(NOT IS_DIRECTORY "${GIT_DIR}") - file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) - string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) - set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}") - endif() - - # Add a custom target for build-info.h - add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") - - # Add a custom command to rebuild build-info.h when .git/index changes - add_custom_command( - OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" - COMMENT "Generating build details from Git" - COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS "${GIT_DIR}/index" - VERBATIM - ) -else() - message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") -endif() - # # Compile flags # @@ -277,13 +244,8 @@ if (LLAMA_BLAS) endif() endif() -if (LLAMA_K_QUANTS) - set(GGML_HEADERS_EXTRA k_quants.h) - 
set(GGML_SOURCES_EXTRA k_quants.c) - add_compile_definitions(GGML_USE_K_QUANTS) - if (LLAMA_QKK_64) - add_compile_definitions(GGML_QKK_64) - endif() +if (LLAMA_QKK_64) + add_compile_definitions(GGML_QKK_64) endif() if (LLAMA_CUBLAS) @@ -305,6 +267,9 @@ if (LLAMA_CUBLAS) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() + if (LLAMA_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -331,6 +296,7 @@ if (LLAMA_CUBLAS) set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics else() set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -404,6 +370,9 @@ if (LLAMA_HIPBLAS) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) endif() + if (LLAMA_CUDA_FORCE_MMQ) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ) + endif() target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) @@ -422,8 +391,7 @@ endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int - -Werror=implicit-function-declaration) + set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) set(host_cxx_flags "") @@ -455,7 +423,8 @@ if (LLAMA_ALL_WARNINGS) set(c_flags ${c_flags} ${warning_flags}) set(cxx_flags ${cxx_flags} ${warning_flags}) add_compile_options("$<$:${c_flags}>" - "$<$:${cxx_flags} ${host_cxx_flags}>") + "$<$:${cxx_flags}>" + "$<$:${host_cxx_flags}>") endif() @@ -665,6 +634,8 @@ add_library(ggml OBJECT ggml-alloc.h ggml-backend.c ggml-backend.h + ggml-quants.c + ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} diff --git a/Makefile b/Makefile index 705fa1eff..300c1e6c7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server embd-input-test gguf llama-bench baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \ speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests @@ -342,13 +342,9 @@ else MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifndef LLAMA_NO_K_QUANTS - MK_CPPFLAGS += -DGGML_USE_K_QUANTS - OBJS += k_quants.o ifdef LLAMA_QKK_64 MK_CPPFLAGS += -DGGML_QKK_64 endif -endif ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. 
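The build-info block removed from the root `CMakeLists.txt` above previously generated a `build-info.h` header at configure time; later hunks in this diff replace it with a compiled `common/build-info.cpp` translation unit, filled in from `common/build-info.cpp.in` and exposed through `extern` declarations in `common/common.h`. Below is a minimal, self-contained sketch of that pattern, not the generated file itself: the literal values are placeholders standing in for what the build scripts derive from Git, and `main` exists only to make the sketch runnable.

```cpp
// Sketch of the build-info pattern introduced in this diff. The four globals are
// normally defined in the generated common/build-info.cpp and only declared
// `extern` in common/common.h; the values here are placeholders, not real output.
#include <cstdio>

int         LLAMA_BUILD_NUMBER = 0;           // generated: build number
char const *LLAMA_COMMIT       = "0000000";   // generated: short Git commit hash
char const *LLAMA_COMPILER     = "unknown";   // generated: compiler identification
char const *LLAMA_BUILD_TARGET = "unknown";   // generated: target platform string

int main() {
    // Mirrors what the print_build_info() macro in common/common.h prints.
    std::fprintf(stderr, "build = %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
    std::fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
    return 0;
}
```

Every consumer links the same object (the `build_info` target in `common/CMakeLists.txt`, `build-info.o` in the Makefile, the `buildinfo` object in `build.zig`), so a change to `.git/index` triggers a rebuild of only that one translation unit rather than of every file that previously included `build-info.h`.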
@@ -365,7 +361,7 @@ ifdef LLAMA_MPI MK_CPPFLAGS += -DGGML_USE_MPI MK_CFLAGS += -Wno-cast-qual MK_CXXFLAGS += -Wno-cast-qual - OBJS += ggml-mpi.o + OBJS += ggml-mpi.o endif # LLAMA_MPI ifdef LLAMA_OPENBLAS @@ -382,7 +378,7 @@ endif # LLAMA_BLIS ifdef LLAMA_CUBLAS MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib - OBJS += ggml-cuda.o + OBJS += ggml-cuda.o NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math ifdef LLAMA_CUDA_NVCC NVCC = $(LLAMA_CUDA_NVCC) @@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV +ifdef LLAMA_CUDA_FORCE_MMQ + NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # LLAMA_CUDA_FORCE_MMQ ifdef LLAMA_CUDA_DMMV_X NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) else @@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI -ifndef LLAMA_NO_K_QUANTS -k_quants.o: k_quants.c k_quants.h - $(CC) $(CFLAGS) -c $< -o $@ -endif # LLAMA_NO_K_QUANTS - # combine build flags with cmdline overrides override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) @@ -539,13 +533,16 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o +ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h -COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o +COMMON_H_DEPS = common/common.h common/sampling.h common/log.h +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -566,54 +563,47 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) # # Examples # -main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. 
====' @echo -infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) - -$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) - - -embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -624,25 +614,28 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL @@ -655,7 +648,7 @@ swift: examples/batched.swift (cd examples/batched.swift; make build) endif -build-info.h: $(wildcard .git/index) scripts/build-info.sh +common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh $(CC) > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ @@ -663,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh rm $@.tmp; \ fi +build-info.o: common/build-info.cpp + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ + # # Tests # tests: $(TEST_TARGETS) -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) run-benchmark-matmult: benchmark-matmult @@ -683,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: 
tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h diff --git a/Package.swift b/Package.swift index 4ab055b19..5b3bd72ca 100644 --- a/Package.swift +++ b/Package.swift @@ -42,13 +42,12 @@ let package = Package( "llama.cpp", "ggml-alloc.c", "ggml-backend.c", - "k_quants.c", + "ggml-quants.c", ] + additionalSources, resources: resources, publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_K_QUANTS"), .define("GGML_USE_ACCELERATE") // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 diff --git a/README.md b/README.md index 0f1fd7565..9c9e36ad0 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -11,12 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401) -- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ - **Devs should become familiar with the new API** -- Local Falcon 180B inference on Mac Studio - - https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e +- ⚠️ **Upcoming change that might break functionality. 
Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912 ---- @@ -89,21 +83,23 @@ as the main playground for developing new features for the [ggml](https://github - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) -- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b) +- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b) - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) -- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) -- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) +- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) +- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) + **Bindings:** - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) @@ -206,7 +202,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the steps for the LLaMA-7B model. +Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. ### Get the Code @@ -279,7 +275,7 @@ In order to build llama.cpp you have three different options. On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option. -When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line +When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line argument. ### MPI Build @@ -573,6 +569,18 @@ python3 convert.py models/7B/ When running the larger models, make sure you have enough disk space to store all the intermediate files. +### Running on Windows with prebuilt binaries + +You will find prebuilt Windows binaries on the release page. + +Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`) + +From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. 
Test out the main example like so: + +``` +.\main -m llama-2-7b.Q4_0.gguf -n 128 +``` + ### Memory/Disk Requirements As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. @@ -952,7 +960,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - [main](./examples/main/README.md) - [server](./examples/server/README.md) -- [embd-input](./examples/embd-input/README.md) - [jeopardy](./examples/jeopardy/README.md) - [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) diff --git a/build.zig b/build.zig index 0b74cee48..699738f3d 100644 --- a/build.zig +++ b/build.zig @@ -10,7 +10,6 @@ const Maker = struct { builder: *std.build.Builder, target: CrossTarget, optimize: Mode, - config_header: *ConfigHeader, enable_lto: bool, include_dirs: ArrayList([]const u8), @@ -41,26 +40,24 @@ const Maker = struct { const commit_hash = try std.ChildProcess.exec( .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, ); - const config_header = builder.addConfigHeader( - .{ .style = .blank, .include_path = "build-info.h" }, - .{ - .BUILD_NUMBER = 0, - .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline - .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}), - .BUILD_TARGET = try target.allocDescription(builder.allocator), - }, - ); + try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( + \\int LLAMA_BUILD_NUMBER = {}; + \\char const *LLAMA_COMMIT = "{s}"; + \\char const *LLAMA_COMPILER = "Zig {s}"; + \\char const *LLAMA_BUILD_TARGET = "{s}"; + \\ + , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); var m = Maker{ .builder = builder, .target = target, .optimize = builder.standardOptimizeOption(.{}), - .config_header = config_header, .enable_lto = false, .include_dirs = ArrayList([]const u8).init(builder.allocator), .cflags = ArrayList([]const u8).init(builder.allocator), .cxxflags = ArrayList([]const u8).init(builder.allocator), .objs = ArrayList(*Compile).init(builder.allocator), }; + try m.addCFlag("-std=c11"); try m.addCxxFlag("-std=c++11"); try m.addProjectInclude(&.{}); @@ -72,7 +69,7 @@ const Maker = struct { const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); if (o.target.getAbi() != .msvc) o.defineCMacro("_GNU_SOURCE", null); - o.addConfigHeader(m.config_header); + if (std.mem.endsWith(u8, src, ".c")) { o.addCSourceFiles(&.{src}, m.cflags.items); o.linkLibC(); @@ -85,7 +82,6 @@ const Maker = struct { o.linkLibCpp(); } } - o.addConfigHeader(m.config_header); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); o.want_lto = m.enable_lto; return o; @@ -105,7 +101,6 @@ const Maker = struct { // linkLibCpp already add (libc++ + libunwind + libc) e.linkLibCpp(); } - e.addConfigHeader(m.config_header); m.builder.installArtifact(e); e.want_lto = m.enable_lto; return e; @@ -116,30 +111,27 @@ pub fn build(b: *std.build.Builder) !void { var make = try Maker.init(b); make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; - if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) { - try make.addFlag("-DGGML_USE_K_QUANTS"); - const k_quants = make.obj("k_quants", "k_quants.c"); - try make.objs.append(k_quants); - } - const ggml = make.obj("ggml", 
"ggml.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); + const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); const llama = make.obj("llama", "llama.cpp"); + const buildinfo = make.obj("common", "common/build-info.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); const sampling = make.obj("sampling", "common/sampling.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); + const clip = make.obj("clip", "examples/llava/clip.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/ci/run.sh b/ci/run.sh index 942b2e00c..2e3343831 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + function check_ppl { qnt="$1" ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) @@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat 
$OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" @@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + function check_ppl { qnt="$1" ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) @@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" @@ -496,10 +502,12 @@ test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then - if [ -z ${GG_BUILD_CUDA} ]; then - test $ret -eq 0 && gg_run open_llama_3b_v2 - else - test $ret -eq 0 && gg_run open_llama_7b_v2 + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then + if [ -z ${GG_BUILD_CUDA} ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 + else + test $ret -eq 0 && gg_run open_llama_7b_v2 + fi fi fi diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index fbb0ff095..ac594b2ca 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,8 +1,46 @@ # common + +# Build info header +# + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git") + + # Is git submodule + if(NOT IS_DIRECTORY "${GIT_DIR}") + file(READ ${GIT_DIR} REAL_GIT_DIR_LINK) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK}) + set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}") + endif() + + set(GIT_INDEX "${GIT_DIR}/index") +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") + set(GIT_INDEX "") +endif() + +# Add a custom command to rebuild build-info.cpp when .git/index changes +add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} + -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.." 
+ DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} + VERBATIM +) +set(TARGET build_info) +add_library(${TARGET} OBJECT build-info.cpp) +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + + set(TARGET common) -add_library(${TARGET} OBJECT +add_library(${TARGET} STATIC common.h common.cpp sampling.h @@ -21,4 +59,4 @@ endif() target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE llama) +target_link_libraries(${TARGET} PRIVATE llama build_info) diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in new file mode 100644 index 000000000..0b945aa68 --- /dev/null +++ b/common/build-info.cpp.in @@ -0,0 +1,4 @@ +int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; +char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; diff --git a/common/common.cpp b/common/common.cpp index 4214e63af..e938dee16 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,5 +1,4 @@ #include "common.h" -#include "build-info.h" #include "llama.h" #include @@ -103,11 +102,26 @@ void process_escapes(std::string& input) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool result = true; + try { + if (!gpt_params_parse_ex(argc, argv, params)) { + gpt_print_usage(argc, argv, gpt_params()); + exit(0); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + gpt_print_usage(argc, argv, gpt_params()); + exit(1); + } + return result; +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; - gpt_params default_params; const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -204,12 +218,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -218,12 +272,19 @@ bool 
gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } sparams.top_p = std::stof(argv[i]); + } else if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.min_p = std::stof(argv[i]); } else if (arg == "--temp") { if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); } else if (arg == "--tfs") { if (++i >= argc) { invalid_param = true; @@ -241,25 +302,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.repeat_last_n = std::stoi(argv[i]); + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.repeat_penalty = std::stof(argv[i]); + sparams.penalty_repeat = std::stof(argv[i]); } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.frequency_penalty = std::stof(argv[i]); + sparams.penalty_freq = std::stof(argv[i]); } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.presence_penalty = std::stof(argv[i]); + sparams.penalty_present = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -384,6 +446,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.lora_base = argv[i]; + } else if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mmproj = argv[i]; + } else if (arg == "--image") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.image = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--embedding") { @@ -534,11 +608,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, default_params); -#ifndef LOG_DISABLE_LOGS - log_print_usage(); -#endif // LOG_DISABLE_LOGS - exit(0); + return false; + } else if (arg == "--random-prompt") { params.random_prompt = true; } else if (arg == "--in-prefix-bos") { @@ -560,7 +631,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.grammar = argv[i]; + sparams.grammar = argv[i]; } else if (arg == "--grammar-file") { if (++i >= argc) { invalid_param = true; @@ -575,7 +646,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.grammar) + std::back_inserter(sparams.grammar) ); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters @@ -597,28 +668,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: unknown argument: " + arg); } } if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, default_params); - exit(1); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { - fprintf(stderr, 
"error: --prompt-cache-all not supported in interactive mode yet\n"); - gpt_print_usage(argc, argv, default_params); - exit(1); + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); process_escapes(params.input_suffix); + process_escapes(sparams.cfg_negative_prompt); for (auto & antiprompt : params.antiprompt) { process_escapes(antiprompt); } @@ -628,8 +695,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); @@ -664,12 +732,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); + printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); @@ -686,9 +755,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. 
(default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -703,6 +779,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); + printf(" --image IMAGE_FILE path to an image file. 
use with multimodal models\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -727,7 +805,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // GGML_USE_CUBLAS #endif printf(" --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); + printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); @@ -738,6 +816,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); printf("\n"); +#ifndef LOG_DISABLE_LOGS + log_print_usage(); +#endif // LOG_DISABLE_LOGS } std::string get_system_info(const gpt_params & params) { @@ -791,21 +872,48 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); - cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; - cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; - cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; - cparams.rope_freq_base = params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale; + cparams.n_ctx = params.n_ctx; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + cparams.mul_mat_q = params.mul_mat_q; + cparams.seed = params.seed; + cparams.f16_kv = params.memory_f16; + cparams.logits_all = params.logits_all; + cparams.embedding = params.embedding; + cparams.rope_scaling_type = params.rope_scaling_type; + cparams.rope_freq_base = params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } +void llama_batch_clear(struct llama_batch & batch) { + batch.n_tokens = 0; +} + +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos, + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits [batch.n_tokens] = logits; + + batch.n_tokens++; +} + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); @@ -843,15 +951,15 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); - llama_kv_cache_tokens_rm(lctx, -1, -1); + llama_kv_cache_clear(lctx); llama_reset_timings(lctx); } @@ -865,21 +973,23 @@ std::tuple llama_init_from_gpt_par std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos) { - return llama_tokenize(llama_get_model(ctx), text, add_bos); + bool add_bos, + bool special) { + return llama_tokenize(llama_get_model(ctx), text, add_bos, special); } std::vector llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos) { + bool add_bos, + bool special) { // upper limit for the number of tokens int n_tokens = text.length() + add_bos; std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); + n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -902,7 +1012,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t } std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(ctx); + const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string piece; std::string result; @@ -1086,28 +1196,28 @@ std::string get_sortable_timestamp() { void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, 
const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; - fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); - fprintf(stream, "build_number: %d\n", BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? 
"true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1141,13 +1251,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); - dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); @@ -1201,14 +1311,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { @@ -1235,6 +1345,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? 
"true" : "false"); } diff --git a/common/common.h b/common/common.h index fa115536b..72a49b890 100644 --- a/common/common.h +++ b/common/common.h @@ -9,6 +9,7 @@ #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" +#include #include #include #include @@ -25,11 +26,17 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info +extern int LLAMA_BUILD_NUMBER; +extern char const *LLAMA_COMMIT; +extern char const *LLAMA_COMPILER; +extern char const *LLAMA_BUILD_TARGET; + // // CLI argument parsing // @@ -54,9 +61,15 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = NAN; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f;// YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters - struct llama_sampling_params sampling_params; + struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -66,10 +79,10 @@ struct gpt_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with - std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter @@ -104,8 +117,14 @@ struct gpt_params { bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool infill = false; // use infill mode + + // multimodal models (see examples/llava) + std::string mmproj = ""; // path to multimodal projector + std::string image = ""; // path to an image file }; +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); + bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); @@ -120,10 +139,23 @@ void process_escapes(std::string& input); // Model utils // +// TODO: avoid tuplue, use struct std::tuple llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params); + +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & 
params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +// Batch utils + +void llama_batch_clear(struct llama_batch & batch); + +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector<llama_seq_id> & seq_ids, + bool logits); + // // Vocab utils // @@ -133,12 +165,14 @@ struct llama_context_params llama_context_params_from_gpt_param std::vector<llama_token> llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos); + bool add_bos, + bool special = false); std::vector<llama_token> llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos); + bool add_bos, + bool special = false); // tokenizes a token into a piece // should work similar to Python's `tokenizer.id_to_piece` diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 5a545a807..ff51cc803 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -399,7 +399,7 @@ namespace grammar_parser { void print_grammar(FILE * file, const parse_state & state) { try { std::map<uint32_t, std::string> symbol_id_names; - for (auto kv : state.symbol_ids) { + for (const auto & kv : state.symbol_ids) { symbol_id_names[kv.second] = kv.first; } for (size_t i = 0, end = state.rules.size(); i < end; i++) { diff --git a/common/log.h b/common/log.h index b8953fdca..c0e814861 100644 --- a/common/log.h +++ b/common/log.h @@ -97,37 +97,56 @@ #define LOG_TEE_TARGET stderr #endif +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + // Utility to obtain "pid" like unique process id and use it when creating log files. inline std::string log_get_pid() { - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. + std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } - return pid; + return pid; } // Utility function for generating log file names with unique id based on thread id. // invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" // where the number is a runtime id of the current thread.
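For illustration only (not part of the patch): a minimal sketch of how the batch helpers declared above are meant to be driven. The model/context setup is assumed to exist elsewhere, and prompt_tokens is a hypothetical, already-tokenized prompt.

    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <vector>

    // feed a tokenized prompt to the model as a single sequence (seq_id 0)
    static bool decode_prompt(llama_context * ctx, const std::vector<llama_token> & prompt_tokens) {
        llama_batch batch = llama_batch_init((int) prompt_tokens.size(), 0, 1);

        llama_batch_clear(batch);
        for (size_t i = 0; i < prompt_tokens.size(); ++i) {
            // request logits only for the last token of the prompt
            llama_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, i == prompt_tokens.size() - 1);
        }

        const bool ok = llama_decode(ctx, batch) == 0;
        if (!ok) {
            fprintf(stderr, "llama_decode() failed\n");
        }

        llama_batch_free(batch);
        return ok;
    }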
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) // INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) +inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) { + static bool _multilog = false; + + if (multilog != LogTriStateSame) + { + _multilog = multilog == LogTriStateTrue; + } + std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + if (_multilog) + { + buf << "."; + buf << log_get_pid(); + } buf << "."; buf << log_file_extension; @@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #define LOG_TEE_FLF_VAL ,"" #endif -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - // INTERNAL, DO NOT USE // USE LOG() INSTEAD // @@ -314,16 +324,23 @@ enum LogTriState #endif // INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) { - static bool _initialized{false}; - static bool _disabled{(filename.empty() && target == nullptr)}; + static bool _initialized = false; + static bool _append = false; + static bool _disabled = filename.empty() && target == nullptr; static std::string log_current_filename{filename}; static FILE *log_current_target{target}; static FILE *logfile = nullptr; if (change) { + if (append != LogTriStateSame) + { + _append = append == LogTriStateTrue; + return logfile; + } + if (disable == LogTriStateTrue) { // Disable primary target @@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } } - logfile = fopen(filename.c_str(), "w"); + logfile = fopen(filename.c_str(), _append ? "a" : "w"); } if (!logfile) @@ -397,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri } // INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) { - return log_handler1_impl(change, disable, filename, target); + return log_handler1_impl(change, append, disable, filename, target); } // Disables logs entirely at runtime. @@ -410,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri // INTERNAL, DO NOT USE inline FILE *log_disable_impl() { - return log_handler1_impl(true, LogTriStateTrue); + return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); } // Enables logs at runtime. 
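As a rough, illustrative sketch (not from the patch): the per-run log files and append mode added in this header are expected to be toggled through the log_multilog/log_append macros defined a bit further down, and, as the comments require, only before the first log call.

    #include "log.h"

    int main() {
        log_multilog(true);   // one uniquely named log file per run
        log_append(true);     // keep the existing log file instead of truncating it
        log_set_target(log_filename_generator("llama", "log"));

        LOG("log initialized\n");
        return 0;
    }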
@@ -419,19 +436,31 @@ inline FILE *log_disable_impl() // INTERNAL, DO NOT USE inline FILE *log_enable_impl() { - return log_handler1_impl(true, LogTriStateFalse); + return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); } // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) #define log_set_target(target) log_set_target_impl(target) // INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } // INTERNAL, DO NOT USE inline FILE *log_handler() { return log_handler1_impl(); } +// Enable or disable creating separate log files for each run. +// can ONLY be invoked BEFORE first log use. +#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") +// Enable or disable append mode for log file. +// can ONLY be invoked BEFORE first log use. +#define log_append(enable) log_append_impl(enable) +// INTERNAL, DO NOT USE +inline FILE *log_append_impl(bool enable) +{ + return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); +} + inline void log_test() { log_disable(); @@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param) return true; } + if (param == "--log-new") + { + log_multilog(true); + return true; + } + + if (param == "--log-append") + { + log_append(true); + return true; + } + return false; } @@ -522,7 +563,9 @@ inline void log_print_usage() printf(" --log-disable Disable trace logs\n"); printf(" --log-enable Enable trace logs\n"); printf(" --log-file Specify a log filename (without extension)\n"); - printf(" Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ + printf(" --log-new Create a separate new log file on start. 
" + "Each log file will have unique name: \"..log\"\n"); + printf(" --log-append Don't truncate the old log file.\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) @@ -579,38 +622,75 @@ inline std::string log_var_to_string_impl(const std::vector & var) return buf.str(); } -#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \ - [&tokens, &ctx]() \ - { \ - std::stringstream buf; \ - buf << "[ "; \ - \ - bool first = true; \ - for (const auto &token : tokens) \ - { \ - if (!first) \ - buf << ", "; \ - else \ - first = false; \ - \ - auto detokenized = llama_token_to_piece(ctx, token); \ - \ - detokenized.erase( \ - std::remove_if( \ - detokenized.begin(), \ - detokenized.end(), \ - [](const unsigned char c) { return !std::isprint(c); }), \ - detokenized.end()); \ - \ - buf \ - << "'" << detokenized << "'" \ - << ":" << std::to_string(token); \ - } \ - buf << " ]"; \ - \ - return buf.str(); \ - }() \ - .c_str() +template +inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (const auto &token : tokens) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + buf << " ]"; + + return buf.str(); +} + +template +inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + buf << " ]"; + + return buf.str(); +} #ifdef LOG_DISABLE_LOGS diff --git a/common/sampling.cpp b/common/sampling.cpp index 8ce419459..1317024c2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,113 +1,163 @@ #include "sampling.h" -llama_sampling_context::~llama_sampling_context() { - for (auto & it : sequence_contexts) { - if (it.second.grammar != NULL) { - llama_grammar_free(it.second.grammar); - it.second.grammar = NULL; +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { + struct llama_sampling_context * result = new llama_sampling_context(); + + result->params = params; + result->grammar = nullptr; + + // if there is a grammar, parse it + if (!params.grammar.empty()) { + result->parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + + // will be empty (default) if there are parse errors + if (result->parsed_grammar.rules.empty()) { + fprintf(stderr, "%s: failed to parse grammar\n", __func__); + return nullptr; } + + std::vector grammar_rules(result->parsed_grammar.c_rules()); + + result->grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); } + + 
result->prev.resize(params.n_prev); + + return result; } -llama_sampling_context llama_sampling_context_init( - const struct gpt_params & params, - llama_grammar * grammar) { - llama_sampling_context result; +void llama_sampling_free(struct llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); + } - result.params = params.sampling_params; - result.grammar = grammar; - return result; + delete ctx; } -// Note: Creates the context if it doesn't exist, so this always return something. -llama_sampler_sequence_context & llama_sampling_get_sequence_context( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq) { - const auto it = ctx_sampling.sequence_contexts.find(seq); - if (it != ctx_sampling.sequence_contexts.end()) { - return it->second; +void llama_sampling_reset(llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); + ctx->grammar = NULL; } - llama_sampler_sequence_context new_ctx = { - 2.0f * ctx_sampling.params.mirostat_tau, - ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL, - }; - return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second; + + if (!ctx->parsed_grammar.rules.empty()) { + std::vector grammar_rules(ctx->parsed_grammar.c_rules()); + + ctx->grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); + } + + std::fill(ctx->prev.begin(), ctx->prev.end(), 0); + ctx->cur.clear(); } -bool llama_sampling_context_reset( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq) { - const auto it = ctx_sampling.sequence_contexts.find(seq); - if (it == ctx_sampling.sequence_contexts.end()) return false; - if (it->second.grammar != NULL) { - llama_grammar_free(it->second.grammar); - it->second.grammar = NULL; +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { + if (dst->grammar) { + llama_grammar_free(dst->grammar); + dst->grammar = nullptr; } - ctx_sampling.sequence_contexts.erase(it); - return true; + + if (src->grammar) { + dst->grammar = llama_grammar_copy(src->grammar); + } + + dst->prev = src->prev; +} + +llama_token llama_sampling_last(llama_sampling_context * ctx) { + return ctx->prev.back(); +} + +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { + const int size = ctx_sampling->prev.size(); + + n = std::min(n, size); + + std::string result; + + for (int i = size - n; i < size; i++) { + result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); + } + + return result; +} + +std::string llama_sampling_print(const llama_sampling_params & params) { + char result[1024]; + + snprintf(result, sizeof(result), + "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, + params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, + params.mirostat, params.mirostat_eta, params.mirostat_tau); + + return std::string(result); } llama_token llama_sampling_sample( - struct llama_context * ctx, - struct llama_context * ctx_guidance, - struct llama_sampling_context & ctx_sampling, - const std::vector & last_tokens, - std::vector & candidates, - const int idx, - 
llama_seq_id seq) { - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + const llama_sampling_params & params = ctx_sampling->params; + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const llama_sampling_params & params = ctx_sampling.params; const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; + const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; + auto & prev = ctx_sampling->prev; + auto & cur = ctx_sampling->cur; + llama_token id = 0; - float * logits = llama_get_logits_ith(ctx, idx); + float * logits = llama_get_logits_ith(ctx_main, idx); - // Apply params.logit_bias map + // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; } - candidates.clear(); + cur.clear(); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } - llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale); + if (ctx_cfg) { + llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); } // apply penalties - if (!last_tokens.empty()) { - const float nl_logit = logits[llama_token_nl(ctx)]; - const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx); + if (!prev.empty()) { + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - llama_sample_repetition_penalty(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); + llama_sample_repetition_penalties(ctx_main, &cur_p, + prev.data() + prev.size() - penalty_last_n, + penalty_last_n, penalty_repeat, penalty_freq, penalty_present); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx)) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { cur_p.data[idx].logit = nl_logit; break; } @@ -115,52 +165,65 @@ llama_token llama_sampling_sample( } } - llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, 
seq); - - if (ctx_seq.grammar != NULL) { - llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar); + if (ctx_sampling->grammar != NULL) { + llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &cur_p); + if (temp < 0.0) { + // greedy sampling, with probs + llama_sample_softmax(ctx_main, &cur_p); + id = cur_p.data[0].id; + } else if (temp == 0.0) { + // greedy sampling, no probs + id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { const int mirostat_m = 100; - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu); + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu); } else if (mirostat == 2) { - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu); + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { - // Temperature sampling + // temperature sampling size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx, &cur_p, top_k, min_keep); - llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx, &cur_p, top_p, min_keep); - llama_sample_temp(ctx, &cur_p, temp); - { - const int n_top = 10; - LOG("top %d candidates:\n", n_top); + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); + llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); + llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); + llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); + llama_sample_temp (ctx_main, &cur_p, temp); - for (int i = 0; i < n_top; i++) { - const llama_token id = cur_p.data[i].id; - (void)id; // To avoid a warning that id is unused when logging is disabled. - LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); - } - } + id = llama_sample_token(ctx_main, &cur_p); - id = llama_sample_token(ctx, &cur_p); + //{ + // const int n_top = 10; + // LOG("top %d candidates:\n", n_top); - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); + // for (int i = 0; i < n_top; i++) { + // const llama_token id = cur_p.data[i].id; + // (void)id; // To avoid a warning that id is unused when logging is disabled. 
+ // LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p); + // } + //} + + LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); } } - if (ctx_seq.grammar != NULL) { - llama_grammar_accept_token(ctx, ctx_seq.grammar, id); - } - return id; } + +void llama_sampling_accept( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id, + bool apply_grammar) { + ctx_sampling->prev.erase(ctx_sampling->prev.begin()); + ctx_sampling->prev.push_back(id); + + if (ctx_sampling->grammar != NULL && apply_grammar) { + llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); + } +} diff --git a/common/sampling.h b/common/sampling.h index 0aab5d03c..7c9b8dcf2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,107 +2,109 @@ #include "llama.h" +#include "grammar-parser.h" + #include #include #include // sampling parameters typedef struct llama_sampling_params { + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ std::string grammar; // optional BNF-like grammar to constrain sampling // Classifier-Free Guidance // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // how strong is guidance std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens - } llama_sampling_params; -// per-sequence sampler context -typedef struct llama_sampler_sequence_context { - float mirostat_mu; // mirostat sampler state - llama_grammar * grammar; -} llama_sampler_sequence_context; - // general sampler context -typedef struct llama_sampling_context { - ~llama_sampling_context(); - - // parameters that will be used for sampling and when creating - // new llama_sampler_sequence_context instances +// TODO: move to llama.h +struct llama_sampling_context { + // parameters that will be used for sampling llama_sampling_params params; - // map of sequence ids to sampler contexts - std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts; + // mirostat sampler state + float mirostat_mu; - // when non-NULL, new instances of llama_sampler_sequence_context - // will get a copy of the grammar here - // note: only the pointer is stored here, it is not a copy of - // the grammar and shouldn't be freed llama_grammar * grammar; -} llama_sampling_context; + + // internal + grammar_parser::parse_state parsed_grammar; + + // TODO: replace with ring-buffer + std::vector<llama_token> prev; + std::vector<llama_token_data> cur; +}; #include "common.h" // Create a new sampling context instance. -llama_sampling_context llama_sampling_context_init( - const struct gpt_params & params, - llama_grammar * grammar = NULL); +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); -// Fetches the sampler context for the specified sequence id (defaults to 0). -// If the context for that sequence id doesn't already exist, it will be created with -// default values based on the parameters in the ctx_sampling argument. -llama_sampler_sequence_context & llama_sampling_get_sequence_context( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq = 0); +void llama_sampling_free(struct llama_sampling_context * ctx); -// Reset the sampler context for the supplied sequence id (defaults to 0). -// This is necessary to reuse a sequence id or free memory used by sequences -// that are no longer required.
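To make the intent of the reworked sampling interface easier to follow, here is a rough usage sketch (not part of the patch); model loading, prompt evaluation and the per-token llama_decode call are elided and assumed to exist.

    #include "common.h"
    #include "sampling.h"

    // sample up to n_predict tokens from an already-evaluated context
    static void sample_loop(llama_context * ctx, const gpt_params & params, int n_predict) {
        llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

        for (int i = 0; i < n_predict; ++i) {
            // pick the next token from the logits of the last evaluated position
            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

            // remember the token and, if a grammar is active, advance its state
            llama_sampling_accept(ctx_sampling, ctx, id, true);

            if (id == llama_token_eos(llama_get_model(ctx))) {
                break;
            }

            // ... decode `id` with a fresh single-token batch before sampling again ...
        }

        llama_sampling_free(ctx_sampling);
    }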
-bool llama_sampling_context_reset( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq = 0); +// Reset the sampler context +// - clear prev tokens +// - reset grammar +void llama_sampling_reset(llama_sampling_context * ctx); + +// Copy the sampler context +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); + +// Get the last sampled token +llama_token llama_sampling_last(llama_sampling_context * ctx); + +// Get a string representation of the last sampled tokens +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); + +// Print sampling parameters into a string +std::string llama_sampling_print(const llama_sampling_params & params); // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call -// llama_sampling_context_reset when a sequence ends +// llama_sampling_reset when a sequence ends // // required: -// - ctx: context to use for sampling +// - ctx_main: context to use for sampling // - ctx_sampling: sampling-specific context // // optional: -// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL -// - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits_ith(ctx, idx) -// - seq: sequence id to associate sampler state with +// - ctx_cfg: context to use for classifier-free guidance +// - idx: sample from llama_get_logits_ith(ctx, idx) // // returns: // - token: sampled token // - candidates: vector of candidate tokens // llama_token llama_sampling_sample( - struct llama_context * ctx, - struct llama_context * ctx_guidance, - struct llama_sampling_context & ctx_sampling, - const std::vector & last_tokens, - std::vector & candidates, - const int idx = 0, - llama_seq_id seq = 0); + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = 0); + +void llama_sampling_accept( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id, + bool apply_grammar); diff --git a/common/stb_image.h b/common/stb_image.h new file mode 100644 index 000000000..4766d7e67 --- /dev/null +++ b/common/stb_image.h @@ -0,0 +1,8396 @@ +/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb + no warranty implied; use at your own risk + + Do this: + #define STB_IMAGE_IMPLEMENTATION + before you include this file in *one* C or C++ file to create the implementation. + + // i.e. it should look like this: + #include ... + #include ... + #include ... + #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" + + You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. 
+ And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. + +RECENT REVISION HISTORY: + + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
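As a concrete, self-contained illustration of the include pattern described at the top of this header (program and file names are hypothetical), the implementation is compiled into exactly one translation unit while every other file includes stb_image.h normally:

    // image_info.cpp (hypothetical) -- the one file that provides the implementation
    #include <cstdio>

    #define STB_IMAGE_IMPLEMENTATION
    #include "stb_image.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s IMAGE_FILE\n", argv[0]);
            return 1;
        }

        int x, y, n;
        unsigned char * data = stbi_load(argv[1], &x, &y, &n, 0);
        if (data == NULL) {
            fprintf(stderr, "failed to load %s: %s\n", argv[1], stbi_failure_reason());
            return 1;
        }

        printf("%s: %d x %d, %d components per pixel\n", argv[1], x, y, n);
        stbi_image_free(data);
        return 0;
    }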
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. 
There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. 
Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
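A small sketch of the configuration knobs described above (the chosen values are purely illustrative): keep only the PNG and JPEG decoders and tighten the dimension limit, all defined before creating the implementation in the single implementation file.

    #define STBI_ONLY_PNG
    #define STBI_ONLY_JPEG                  // all decoders not listed are compiled out
    #define STBI_MAX_DIMENSIONS (1 << 13)   // reject images wider or taller than 8192 pixels
    #define STB_IMAGE_IMPLEMENTATION
    #include "stb_image.h"

    #include <cstdio>

    int main(int argc, char ** argv) {
        int x, y, comp;
        if (argc > 1 && stbi_info(argv[1], &x, &y, &comp)) {
            printf("%s: %d x %d, %d components\n", argv[1], x, y, comp);
            return 0;
        }
        return 1;
    }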
+ +#ifndef STBI_NO_STDIO +#include +#endif // STBI_NO_STDIO + +#define STBI_VERSION 1 + +enum { + STBI_default = 0, // only used for desired_channels + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +#include +typedef unsigned char stbi_uc; +typedef unsigned short stbi_us; + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef STBIDEF +#ifdef STB_IMAGE_STATIC +#define STBIDEF static +#else +#define STBIDEF extern +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// PRIMARY API - works on images of any type +// + +// +// load image by filename, open file, or memory buffer +// + +typedef struct { + int (*read)(void * user, char * data, + int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip)(void * user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof)(void * user); // returns nonzero if we are at end of file/data +} stbi_io_callbacks; + +//////////////////////////////////// +// +// 8-bits-per-channel interface +// + +STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z, + int * comp, int req_comp); +#endif + +#ifdef STBI_WINDOWS_UTF8 +STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input); +#endif + +//////////////////////////////////// +// +// 16-bits-per-channel interface +// + +STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF stbi_us * stbi_load_from_file_16(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); +#endif + +//////////////////////////////////// +// +// float-per-channel interface +// +#ifndef STBI_NO_LINEAR +STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels); +STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * channels_in_file, + int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels); +STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels); +#endif +#endif + +#ifndef STBI_NO_HDR +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); +STBIDEF void stbi_hdr_to_ldr_scale(float scale); +#endif // STBI_NO_HDR + +#ifndef 
STBI_NO_LINEAR +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); +STBIDEF void stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr(char const * filename); +STBIDEF int stbi_is_hdr_from_file(FILE * f); +#endif // STBI_NO_STDIO + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char * stbi_failure_reason(void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free(void * retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * clbk, void * user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp); +STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp); +STBIDEF int stbi_is_16_bit(char const * filename); +STBIDEF int stbi_is_16_bit_from_file(FILE * f); +#endif + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen); +STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen, + int parse_header); +STBIDEF char * stbi_zlib_decode_malloc(const char * buffer, int len, int * outlen); +STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, const char * ibuffer, int ilen); + +STBIDEF char * stbi_zlib_decode_noheader_malloc(const char * buffer, int len, int * outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen); + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // 
STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \ + defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || \ + defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) +#ifndef STBI_ONLY_JPEG +#define STBI_NO_JPEG +#endif +#ifndef STBI_ONLY_PNG +#define STBI_NO_PNG +#endif +#ifndef STBI_ONLY_BMP +#define STBI_NO_BMP +#endif +#ifndef STBI_ONLY_PSD +#define STBI_NO_PSD +#endif +#ifndef STBI_ONLY_TGA +#define STBI_NO_TGA +#endif +#ifndef STBI_ONLY_GIF +#define STBI_NO_GIF +#endif +#ifndef STBI_ONLY_HDR +#define STBI_NO_HDR +#endif +#ifndef STBI_ONLY_PIC +#define STBI_NO_PIC +#endif +#ifndef STBI_ONLY_PNM +#define STBI_NO_PNM +#endif +#endif + +#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB) +#define STBI_NO_ZLIB +#endif + +#include <limits.h> +#include <stdarg.h> +#include <stddef.h> // ptrdiff_t on osx +#include <stdlib.h> +#include <string.h> + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#include <math.h> // ldexp, pow +#endif + +#ifndef STBI_NO_STDIO +#include <stdio.h> +#endif + +#ifndef STBI_ASSERT +#include <assert.h> +#define STBI_ASSERT(x) assert(x) +#endif + +#ifdef __cplusplus +#define STBI_EXTERN extern "C" +#else +#define STBI_EXTERN extern +#endif + +#ifndef _MSC_VER +#ifdef __cplusplus +#define stbi_inline inline +#else +#define stbi_inline +#endif +#else +#define stbi_inline __forceinline +#endif + +#ifndef STBI_NO_THREAD_LOCALS +#if defined(__cplusplus) && __cplusplus >= 201103L +#define STBI_THREAD_LOCAL thread_local +#elif defined(__GNUC__) && __GNUC__ < 5 +#define STBI_THREAD_LOCAL __thread +#elif defined(_MSC_VER) +#define STBI_THREAD_LOCAL __declspec(thread) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) +#define STBI_THREAD_LOCAL _Thread_local +#endif + +#ifndef STBI_THREAD_LOCAL +#if defined(__GNUC__) +#define STBI_THREAD_LOCAL __thread +#endif +#endif +#endif + +#if defined(_MSC_VER) || defined(__SYMBIAN32__) +typedef unsigned short stbi__uint16; +typedef signed short stbi__int16; +typedef unsigned int stbi__uint32; +typedef signed int stbi__int32; +#else +#include <stdint.h> +typedef uint16_t stbi__uint16; +typedef int16_t stbi__int16; +typedef uint32_t stbi__uint32; +typedef int32_t stbi__int32; +#endif + +// should produce compiler error if size is wrong +typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1]; + +#ifdef _MSC_VER +#define STBI_NOTUSED(v) (void)(v) +#else +#define STBI_NOTUSED(v) (void)sizeof(v) +#endif + +#ifdef _MSC_VER +#define STBI_HAS_LROTL +#endif + +#ifdef STBI_HAS_LROTL +#define stbi_lrot(x, y) _lrotl(x, y) +#else +#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (-(y)&31))) +#endif + +#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) +// ok +#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED) +// ok +#else +#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)." 
+#endif + +#ifndef STBI_MALLOC +#define STBI_MALLOC(sz) malloc(sz) +#define STBI_REALLOC(p, newsz) realloc(p, newsz) +#define STBI_FREE(p) free(p) +#endif + +#ifndef STBI_REALLOC_SIZED +#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz) +#endif + +// x86/x64 detection +#if defined(__x86_64__) || defined(_M_X64) +#define STBI__X64_TARGET +#elif defined(__i386) || defined(_M_IX86) +#define STBI__X86_TARGET +#endif + +#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD) +// gcc doesn't support sse2 intrinsics unless you compile with -msse2, +// which in turn means it gets to use SSE2 everywhere. This is unfortunate, +// but previous attempts to provide the SSE2 functions with runtime +// detection caused numerous issues. The way architecture extensions are +// exposed in GCC/Clang is, sadly, not really suited for one-file libs. +// New behavior: if compiled with -msse2, we use SSE2 without any +// detection; if not, we don't use it at all. +#define STBI_NO_SIMD +#endif + +#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) +// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET +// +// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the +// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. +// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not +// simultaneously enabling "-mstackrealign". +// +// See https://github.com/nothings/stb/issues/81 for more information. +// +// So default to no SSE2 on 32-bit MinGW. If you've read this far and added +// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2. +#define STBI_NO_SIMD +#endif + +#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) +#define STBI_SSE2 +#include <emmintrin.h> + +#ifdef _MSC_VER + +#if _MSC_VER >= 1400 // not VC6 +#include <intrin.h> // __cpuid +static int stbi__cpuid3(void) { + int info[4]; + __cpuid(info, 1); + return info[3]; +} +#else +static int stbi__cpuid3(void) { + int res; + __asm { + mov eax,1 + cpuid + mov res,edx + } + return res; +} +#endif + +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) { + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; +} +#endif + +#else // assume GCC-style if not VC++ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) { + // If we're even attempting to compile this on GCC/Clang, that means + // -msse2 is on, which means the compiler is allowed to use SSE2 + // instructions at will, and so are we. 
+ return 1; +} +#endif + +#endif +#endif + +// ARM NEON +#if defined(STBI_NO_SIMD) && defined(STBI_NEON) +#undef STBI_NEON +#endif + +#ifdef STBI_NEON +#include <arm_neon.h> +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) +#endif +#endif + +#ifndef STBI_SIMD_ALIGN +#define STBI_SIMD_ALIGN(type, name) type name +#endif + +#ifndef STBI_MAX_DIMENSIONS +#define STBI_MAX_DIMENSIONS (1 << 24) +#endif + +/////////////////////////////////////////////// +// +// stbi__context struct and start_xxx functions + +// stbi__context structure is our basic context used by all images, so it +// contains all the IO context, plus some basic image information +typedef struct { + stbi__uint32 img_x, img_y; + int img_n, img_out_n; + + stbi_io_callbacks io; + void * io_user_data; + + int read_from_callbacks; + int buflen; + stbi_uc buffer_start[128]; + int callback_already_read; + + stbi_uc *img_buffer, *img_buffer_end; + stbi_uc *img_buffer_original, *img_buffer_original_end; +} stbi__context; + +static void stbi__refill_buffer(stbi__context * s); + +// initialize a memory-decode context +static void stbi__start_mem(stbi__context * s, stbi_uc const * buffer, int len) { + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len; +} + +// initialize a callback-based context +static void stbi__start_callbacks(stbi__context * s, stbi_io_callbacks * c, void * user) { + s->io = *c; + s->io_user_data = user; + s->buflen = sizeof(s->buffer_start); + s->read_from_callbacks = 1; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; + stbi__refill_buffer(s); + s->img_buffer_original_end = s->img_buffer_end; +} + +#ifndef STBI_NO_STDIO + +static int stbi__stdio_read(void * user, char * data, int size) { return (int)fread(data, 1, size, (FILE *)user); } + +static void stbi__stdio_skip(void * user, int n) { + int ch; + fseek((FILE *)user, n, SEEK_CUR); + ch = fgetc((FILE *)user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *)user); /* push byte back onto stream if valid. 
*/ + } +} + +static int stbi__stdio_eof(void * user) { return feof((FILE *)user) || ferror((FILE *)user); } + +static stbi_io_callbacks stbi__stdio_callbacks = { + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context * s, FILE * f) { stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f); } + +// static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context * s) { + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum { STBI_ORDER_RGB, STBI_ORDER_BGR }; + +typedef struct { + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context * s); +static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context * s); +static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__png_is16(stbi__context * s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context * s); +static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context * s); +static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context * s); +static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc); +static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__psd_is16(stbi__context * s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context * s); +static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context * s); +static void * stbi__pic_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context * s); +static void * stbi__gif_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); +static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp); +static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context * s); +static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri); 
+static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp); +static int stbi__pnm_is16(stbi__context * s); +#endif + +static +#ifdef STBI_THREAD_LOCAL + STBI_THREAD_LOCAL +#endif + const char * stbi__g_failure_reason; + +STBIDEF const char * stbi_failure_reason(void) { return stbi__g_failure_reason; } + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char * str) { + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void * stbi__malloc(size_t size) { return STBI_MALLOC(size); } + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) { + if (b < 0) + return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. +static int stbi__mul2sizes_valid(int a, int b) { + if (a < 0 || b < 0) + return 0; + if (b == 0) + return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__addsizes_valid(a * b * c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) && + stbi__addsizes_valid(a * b * c * d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void * stbi__malloc_mad2(int a, int b, int add) { + if (!stbi__mad2sizes_valid(a, b, add)) + return NULL; + return stbi__malloc(a * b + add); +} +#endif + +static void * stbi__malloc_mad3(int a, int b, int c, int add) { + if (!stbi__mad3sizes_valid(a, b, c, add)) + return NULL; + return stbi__malloc(a * b * c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static void * stbi__malloc_mad4(int a, int b, int c, int d, int add) { + if (!stbi__mad4sizes_valid(a, b, c, d, add)) + return NULL; + return stbi__malloc(a * b * c * d + add); +} +#endif + +// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 
inclusive), 0 on overflow. +static int stbi__addints_valid(int a, int b) { + if ((a >= 0) != (b >= 0)) + return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) + return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) { + if (b == 0 || b == -1) + return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) + return a <= SHRT_MAX / b; // product is positive, so similar to mul2sizes_valid + if (b < 0) + return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS +#define stbi__err(x, y) 0 +#elif defined(STBI_FAILURE_USERMSG) +#define stbi__err(x, y) stbi__err(y) +#else +#define stbi__err(x, y) stbi__err(x) +#endif + +#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL)) +#define stbi__errpuc(x, y) ((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL)) + +STBIDEF void stbi_image_free(void * retval_from_stbi_load) { STBI_FREE(retval_from_stbi_load); } + +#ifndef STBI_NO_LINEAR +static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) { + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) { + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load \ + (stbi__vertically_flip_on_load_set ? 
stbi__vertically_flip_on_load_local : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void * stbi__load_main(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) { + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + +// test the formats with a very explicit header first (at least a FOURCC +// or distinctive magic number first) +#ifndef STBI_NO_PNG + if (stbi__png_test(s)) + return stbi__png_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) + return stbi__bmp_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_GIF + if (stbi__gif_test(s)) + return stbi__gif_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_PSD + if (stbi__psd_test(s)) + return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc); +#else + STBI_NOTUSED(bpc); +#endif +#ifndef STBI_NO_PIC + if (stbi__pic_test(s)) + return stbi__pic_load(s, x, y, comp, req_comp, ri); +#endif + +// then the formats that can end up attempting to load with just 1 or 2 +// bytes matching expectations; these are prone to false positives, so +// try them later +#ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) + return stbi__jpeg_load(s, x, y, comp, req_comp, ri); +#endif +#ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) + return stbi__pnm_load(s, x, y, comp, req_comp, ri); +#endif + +#ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float * hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } +#endif + +#ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s, x, y, comp, req_comp, ri); +#endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc * stbi__convert_16_to_8(stbi__uint16 * orig, int w, int h, int channels) { + int i; + int img_len = w * h * channels; + stbi_uc * reduced; + + reduced = (stbi_uc *)stbi__malloc(img_len); + if (reduced == NULL) + return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 * stbi__convert_8_to_16(stbi_uc * orig, int w, int h, int channels) { + int i; + int img_len = w * h * channels; + stbi__uint16 * enlarged; + + enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2); + if (enlarged == NULL) + return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void * image, int w, int h, int bytes_per_pixel) { + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc * bytes = (stbi_uc *)image; + + for (row = 0; row < (h >> 1); row++) { + stbi_uc * row0 = bytes + row * bytes_per_row; + stbi_uc * row1 = bytes + (h - row - 1) * bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void * image, int w, int h, int z, int bytes_per_pixel) { + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc * bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char * stbi__load_and_postprocess_8bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + stbi__result_info ri; + void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *)result; +} + +static stbi__uint16 * stbi__load_and_postprocess_16bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + stbi__result_info ri; + void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *)result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float * result, int * x, int * y, int * comp, int req_comp) { + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? 
req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char * str, + int cbmb, wchar_t * widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, + const wchar_t * widestr, int cchwide, char * str, int cbmb, + const char * defchar, int * used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input) { + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int)bufferlen, NULL, NULL); +} +#endif + +static FILE * stbi__fopen(char const * filename, char const * mode) { + FILE * f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename) / sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode) / sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f = 0; +#else + f = fopen(filename, mode); +#endif + return f; +} + +STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * comp, int req_comp) { + FILE * f = stbi__fopen(filename, "rb"); + unsigned char * result; + if (!f) + return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f, x, y, comp, req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) { + unsigned char * result; + stbi__context s; + stbi__start_file(&s, f); + result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 * stbi_load_from_file_16(FILE * f, int * x, int * y, int * comp, int req_comp) { + stbi__uint16 * result; + stbi__context s; + stbi__start_file(&s, f); + result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * comp, int req_comp) { + FILE * f = stbi__fopen(filename, "rb"); + stbi__uint16 * result; + if (!f) + return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f, x, y, comp, req_comp); + fclose(f); + return result; +} + +#endif //! 
STBI_NO_STDIO + +STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file, + int desired_channels) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); +} + +STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, + int * channels_in_file, int desired_channels) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels); +} + +STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); +} + +STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp, + int req_comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z, + int * comp, int req_comp) { + unsigned char * result; + stbi__context s; + stbi__start_mem(&s, buffer, len); + + result = (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices(result, *x, *y, *z, *comp); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float * stbi__loadf_main(stbi__context * s, int * x, int * y, int * comp, int req_comp) { + unsigned char * data; +#ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float * hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data, x, y, comp, req_comp); + return hdr_data; + } +#endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? 
req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__loadf_main(&s, x, y, comp, req_comp); +} + +STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp, + int req_comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__loadf_main(&s, x, y, comp, req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * comp, int req_comp) { + float * result; + FILE * f = stbi__fopen(filename, "rb"); + if (!f) + return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f, x, y, comp, req_comp); + fclose(f); + return result; +} + +STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) { + stbi__context s; + stbi__start_file(&s, f); + return stbi__loadf_main(&s, x, y, comp, req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always +// reports false! + +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int len) { +#ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__hdr_test(&s); +#else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; +#endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr(char const * filename) { + FILE * f = stbi__fopen(filename, "rb"); + int result = 0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE * f) { +#ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s, f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; +#else + STBI_NOTUSED(f); + return 0; +#endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user) { +#ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__hdr_test(&s); +#else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; +#endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1 / gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1 / scale; } + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header }; + +static void stbi__refill_buffer(stbi__context * s) { + int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen); + s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 
0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + 1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context * s) { + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context * s) { + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) + return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) + return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && \ + defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context * s, int n) { + if (n == 0) + return; // already there! + if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context * s, stbi_uc * buffer, int n) { + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen); + res = (count == (n - blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer + n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context * s) { + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context * s) { + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context * s) { + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context * s) { + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && \ + defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much 
as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) { return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); } +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && \ + defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char * stbi__convert_format(unsigned char * data, int img_n, int req_comp, unsigned int x, unsigned int y) { + int i, j; + unsigned char * good; + + if (req_comp == img_n) + return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) { + unsigned char * src = data + j * x * img_n; + unsigned char * dest = good + j * x * req_comp; + +#define STBI__COMBO(a, b) ((a)*8 + (b)) +#define STBI__CASE(a, b) \ + case STBI__COMBO(a, b): \ + for (i = x - 1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1, 2) { + dest[0] = src[0]; + dest[1] = 255; + } + break; + STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(1, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = 255; + } + break; + STBI__CASE(2, 1) { dest[0] = src[0]; } + break; + STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(2, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = src[1]; + } + break; + STBI__CASE(3, 4) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = 255; + } + break; + STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } + break; + STBI__CASE(3, 2) { + dest[0] = stbi__compute_y(src[0], src[1], src[2]); + dest[1] = 255; + } + break; + STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } + break; + STBI__CASE(4, 2) { + dest[0] = stbi__compute_y(src[0], src[1], src[2]); + dest[1] = src[3]; + } + break; + STBI__CASE(4, 3) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } + break; + default: + STBI_ASSERT(0); + STBI_FREE(data); + STBI_FREE(good); + return stbi__errpuc("unsupported", "Unsupported format conversion"); + } +#undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) { return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); } +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 * stbi__convert_format16(stbi__uint16 * data, int img_n, int req_comp, unsigned int x, unsigned int y) { + int i, j; + stbi__uint16 * good; + + if (req_comp == img_n) + return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) 
{ + stbi__uint16 * src = data + j * x * img_n; + stbi__uint16 * dest = good + j * x * req_comp; + +#define STBI__COMBO(a, b) ((a)*8 + (b)) +#define STBI__CASE(a, b) \ + case STBI__COMBO(a, b): \ + for (i = x - 1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1, 2) { + dest[0] = src[0]; + dest[1] = 0xffff; + } + break; + STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(1, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = 0xffff; + } + break; + STBI__CASE(2, 1) { dest[0] = src[0]; } + break; + STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } + break; + STBI__CASE(2, 4) { + dest[0] = dest[1] = dest[2] = src[0]; + dest[3] = src[1]; + } + break; + STBI__CASE(3, 4) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest[3] = 0xffff; + } + break; + STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } + break; + STBI__CASE(3, 2) { + dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); + dest[1] = 0xffff; + } + break; + STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } + break; + STBI__CASE(4, 2) { + dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); + dest[1] = src[3]; + } + break; + STBI__CASE(4, 3) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } + break; + default: + STBI_ASSERT(0); + STBI_FREE(data); + STBI_FREE(good); + return (stbi__uint16 *)stbi__errpuc("unsupported", "Unsupported format conversion"); + } +#undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp) { + int i, k, n; + float * output; + if (!data) + return NULL; + output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { + STBI_FREE(data); + return stbi__errpf("outofmem", "Out of memory"); + } + // compute number of non-alpha components + if (comp & 1) + n = comp; + else + n = comp - 1; + for (i = 0; i < x * y; ++i) { + for (k = 0; k < n; ++k) { + output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i = 0; i < x * y; ++i) { + output[i * comp + n] = data[i * comp + n] / 255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int)(x)) +static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp) { + int i, k, n; + stbi_uc * output; + if (!data) + return NULL; + output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + // compute number of non-alpha components + if (comp & 1) + n = comp; + else + n = comp - 1; + for (i = 0; i < x * y; ++i) { + for (k = 0; k < n; ++k) { + float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) + z = 0; + if (z > 255) + z = 255; + output[i * comp + k] = (stbi_uc)stbi__float2int(z); + } + if (k < comp) { + float z = data[i * comp + k] * 255 + 0.5f; + if (z < 0) + z = 0; + if (z > 255) + z = 255; + output[i * comp + k] = (stbi_uc)stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple 
implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct { + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct { + stbi__context * s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + + // sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + + // definition of jpeg image component + struct { + int id; + int h, v; + int tq; + int hd, ha; + int dc_pred; + + int x, y, w2, h2; + stbi_uc * data; + void *raw_data, *raw_coeff; + stbi_uc * linebuf; + short * coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + + // kernels + void (*idct_block_kernel)(stbi_uc * out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count, + int step); + stbi_uc * (*resample_row_hv_2_kernel)(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman * h, int * count) { + int i, j, k = 0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i = 0; i < 16; ++i) { + for (j = 0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc)(i + 1); + if (k >= 257) + return stbi__err("bad size list", "Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for (j = 1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16)(code++); + if (code - 1 >= (1u << j)) + return stbi__err("bad code lengths", "Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16 - j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec 
acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i = 0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS - s); + int m = 1 << (FAST_BITS - s); + for (j = 0; j < m; ++j) { + h->fast[c + j] = (stbi_uc)i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. +static void stbi__build_fast_ac(stbi__int16 * fast_ac, stbi__huffman * h) { + int i; + for (i = 0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) + k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg * j) { + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) + c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char)c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17] = {0, 1, 3, 7, 15, 31, 63, 127, 255, + 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg * j, stbi__huffman * h) { + unsigned int temp; + int c, k; + + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k = FAST_BITS + 1;; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if (c < 0 || c >= 256) // symbol id out of bounds! 
+ return -1; + STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; +} + +// bias[n] = (-1<<n) + 1 +static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767}; + +// combined JPEG 'receive' and JPEG 'extend', since baseline +// always extends everything it receives. +stbi_inline static int stbi__extend_receive(stbi__jpeg * j, int n) { + unsigned int k; + int sgn; + if (j->code_bits < n) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) + return 0; // ran out of bits from stream, return 0s intead of continuing + + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k + (stbi__jbias[n] & (sgn - 1)); +} + +// get some unsigned bits +stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg * j, int n) { + unsigned int k; + if (j->code_bits < n) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) + return 0; // ran out of bits from stream, return 0s intead of continuing + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k; +} + +stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg * j) { + unsigned int k; + if (j->code_bits < 1) + stbi__grow_buffer_unsafe(j); + if (j->code_bits < 1) + return 0; // ran out of bits from stream, return 0s intead of continuing + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; +} + +// given a value that's at position X in the zigzag stream, +// where does it appear in the 8x8 matrix coded as row-major? +static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, + 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63}; + +// decode one 64-entry block-- +static int stbi__jpeg_decode_block(stbi__jpeg * j, short data[64], stbi__huffman * hdc, stbi__huffman * hac, stbi__int16 * fac, + int b, stbi__uint16 * dequant) { + int diff, dc, k; + int t; + + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) + return stbi__err("bad huffman code", "Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data, 0, 64 * sizeof(data[0])); + + diff = t ? 
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) + return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short)(dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) + return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) + break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg * j, short data[64], stbi__huffman * hdc, int b) { + int diff, dc; + int t; + if (j->spec_end != 0) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) + return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short)(dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg * j, short data[64], stbi__huffman * hac, stbi__int16 * fac) { + int k; + if (j->spec_start == 0) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) + return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short)(stbi__extend_receive(j, s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short)(1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short * p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r, s; + int rs = stbi__jpeg_huff_decode( + j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) + return stbi__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) + return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short * p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short)s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) { + // trick to use a single test to catch both cases + if 
((unsigned int)x > 255) { + if (x < 0) + return 0; + if (x > 255) + return 255; + } + return (stbi_uc)x; +} + +#define stbi__f2f(x) ((int)(((x)*4096 + 0.5))) +#define stbi__fsh(x) ((x)*4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \ + int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2 + p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3 * stbi__f2f(-1.847759065f); \ + t3 = p1 + p2 * stbi__f2f(0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2 + p3); \ + t1 = stbi__fsh(p2 - p3); \ + x0 = t0 + t3; \ + x3 = t0 - t3; \ + x1 = t1 + t2; \ + x2 = t1 - t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0 + t2; \ + p4 = t1 + t3; \ + p1 = t0 + t3; \ + p2 = t1 + t2; \ + p5 = (p3 + p4) * stbi__f2f(1.175875602f); \ + t0 = t0 * stbi__f2f(0.298631336f); \ + t1 = t1 * stbi__f2f(2.053119869f); \ + t2 = t2 * stbi__f2f(3.072711026f); \ + t3 = t3 * stbi__f2f(1.501321110f); \ + p1 = p5 + p1 * stbi__f2f(-0.899976223f); \ + p2 = p5 + p2 * stbi__f2f(-2.562915447f); \ + p3 = p3 * stbi__f2f(-1.961570560f); \ + p4 = p4 * stbi__f2f(-0.390180644f); \ + t3 += p1 + p4; \ + t2 += p2 + p3; \ + t1 += p2 + p4; \ + t0 += p1 + p3; + +static void stbi__idct_block(stbi_uc * out, int out_stride, short data[64]) { + int i, val[64], *v = val; + stbi_uc * o; + short * d = data; + + // columns + for (i = 0; i < 8; ++i, ++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] * 4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; + x1 += 512; + x2 += 512; + x3 += 512; + v[0] = (x0 + t3) >> 10; + v[56] = (x0 - t3) >> 10; + v[8] = (x1 + t2) >> 10; + v[48] = (x1 - t2) >> 10; + v[16] = (x2 + t1) >> 10; + v[40] = (x2 - t1) >> 10; + v[24] = (x3 + t0) >> 10; + v[32] = (x3 - t0) >> 10; + } + } + + for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128 << 17); + x1 += 65536 + (128 << 17); + x2 += 65536 + (128 << 17); + x3 += 65536 + (128 << 17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0 + t3) >> 17); + o[7] = stbi__clamp((x0 - t3) >> 17); + o[1] = stbi__clamp((x1 + t2) >> 17); + o[6] = stbi__clamp((x1 - t2) >> 17); + o[2] = stbi__clamp((x2 + t1) >> 17); + o[5] = stbi__clamp((x2 - t1) >> 17); + o[3] = stbi__clamp((x3 + t0) >> 17); + o[4] = stbi__clamp((x3 - t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. 
not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". +static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) { + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + +// dot product constant: even elems=x, odd elems=y +#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y)) + +// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) +// out(1) = c1[even]*x + c1[odd]*y +#define dct_rot(out0, out1, x, y, c0, c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + +// out = in << 12 (in 16-bit, out 32-bit) +#define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + +// wide add +#define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + +// butterfly a/b, add bias, then shift by "s" and pack +#define dct_bfly32o(out0, out1, a, b, bias, s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + +// 8-bit interleave step (for transposes) +#define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + +// 16-bit interleave step (for transposes) +#define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + +#define dct_pass(bias, shift) \ + { \ + /* even part */ \ + dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \ + dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0, row7, x0, x7, bias, shift); \ + dct_bfly32o(row1, row6, x1, x6, bias, shift); \ + dct_bfly32o(row2, row5, x2, x5, bias, shift); \ + dct_bfly32o(row3, row4, x3, x4, bias, shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = 
dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); + + // load + row0 = _mm_load_si128((const __m128i *)(data + 0 * 8)); + row1 = _mm_load_si128((const __m128i *)(data + 1 * 8)); + row2 = _mm_load_si128((const __m128i *)(data + 2 * 8)); + row3 = _mm_load_si128((const __m128i *)(data + 3 * 8)); + row4 = _mm_load_si128((const __m128i *)(data + 4 * 8)); + row5 = _mm_load_si128((const __m128i *)(data + 5 * 8)); + row6 = _mm_load_si128((const __m128i *)(data + 6 * 8)); + row7 = _mm_load_si128((const __m128i *)(data + 7 * 8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *)out, p0); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p2); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p1); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e)); + out += out_stride; + _mm_storel_epi64((__m128i *)out, p3); + out += out_stride; + _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. 
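+// Note: unlike the SSE2 path there are no per-pass bias adds here; rounding comes from the
+// rounding narrowing shifts, and the 1024 added to the DC coefficient works out to +128 at
+// every output pixel (net >>3 across both passes), re-centering -128..127 to 0..255.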
+static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) { + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0, out1, a, b, shiftop, s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \ + dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \ + dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \ + dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \ + } + + // load + row0 = vld1q_s16(data + 0 * 8); + row1 = vld1q_s16(data + 1 * 8); + row2 = vld1q_s16(data + 2 * 8); + row3 = vld1q_s16(data + 3 * 8); + row4 = 
vld1q_s16(data + 4 * 8); + row5 = vld1q_s16(data + 5 * 8); + row6 = vld1q_s16(data + 6 * 8); + row7 = vld1q_s16(data + 7 * 8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) \ + { \ + int16x8x2_t t = vtrnq_s16(x, y); \ + x = t.val[0]; \ + y = t.val[1]; \ + } +#define dct_trn32(x, y) \ + { \ + int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \ + x = vreinterpretq_s16_s32(t.val[0]); \ + y = vreinterpretq_s16_s32(t.val[1]); \ + } +#define dct_trn64(x, y) \ + { \ + int16x8_t x0 = x; \ + int16x8_t y0 = y; \ + x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \ + y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \ + } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) \ + { \ + uint8x8x2_t t = vtrn_u8(x, y); \ + x = t.val[0]; \ + y = t.val[1]; \ + } +#define dct_trn8_16(x, y) \ + { \ + uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \ + x = vreinterpret_u8_u16(t.val[0]); \ + y = vreinterpret_u8_u16(t.val[1]); \ + } +#define dct_trn8_32(x, y) \ + { \ + uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \ + x = vreinterpret_u8_u32(t.val[0]); \ + y = vreinterpret_u8_u32(t.val[1]); \ + } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! 
+ + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); + out += out_stride; + vst1_u8(out, p1); + out += out_stride; + vst1_u8(out, p2); + out += out_stride; + vst1_u8(out, p3); + out += out_stride; + vst1_u8(out, p4); + out += out_stride; + vst1_u8(out, p5); + out += out_stride; + vst1_u8(out, p6); + out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg * j) { + stbi_uc x; + if (j->marker != STBI__MARKER_none) { + x = j->marker; + j->marker = STBI__MARKER_none; + return x; + } + x = stbi__get8(j->s); + if (x != 0xff) + return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg * j) { + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg * z) { + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i, j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, + z->dequant[z->img_comp[n].tq])) + return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i, j, k, x, y; + STBI_SIMD_ALIGN(short, data[64]); + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x) * 8; + int y2 = (j * z->img_comp[n].v + y) * 8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, + z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) + return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, + data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i, j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i, j, k, x, y; + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x); + int y2 = (j * z->img_comp[n].v + y); + short * data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) + stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) + return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short * data, stbi__uint16 * dequant) { + int i; + for (i = 0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg * z) { + if (z->progressive) { + // dequantize and idct the data + int i, j, n; + for (n = 0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg * z, int m) { + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker", "Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) + return stbi__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s) - 2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15, i; + if (p != 0 && p != 1) + return stbi__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) + return stbi__err("bad DQT table", "Corrupt JPEG"); + + for (i = 0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L == 0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s) - 2; + while (L > 0) { + stbi_uc * v; + int sizes[16], i, n = 0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) + return stbi__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if (n > 256) + return stbi__err("bad DHT header", "Corrupt JPEG"); // Loop over i < n would write past end of values! 
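+            // 17 bytes consumed so far for this table: 1 byte of class/slot plus the 16
+            // code-length counts; the n symbol values follow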
+ L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc + th, sizes)) + return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac + th, sizes)) + return 0; + v = z->huff_ac[th].values; + } + for (i = 0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L == 0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len", "Corrupt JPEG"); + else + return stbi__err("bad APP len", "Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'}; + int ok = 1; + int i; + for (i = 0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'}; + int ok = 1; + int i; + for (i = 0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker", "Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg * z) { + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) + return stbi__err("bad SOS component count", "Corrupt JPEG"); + if (Ls != 6 + 2 * z->scan_n) + return stbi__err("bad SOS len", "Corrupt JPEG"); + for (i = 0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) + return 0; // no match + z->img_comp[which].hd = q >> 4; + if (z->img_comp[which].hd > 3) + return stbi__err("bad DC huff", "Corrupt JPEG"); + z->img_comp[which].ha = q & 15; + if (z->img_comp[which].ha > 3) + return stbi__err("bad AC huff", "Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) + return stbi__err("bad SOS", "Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) + return stbi__err("bad SOS", "Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg * z, int ncomp, int why) { + int i; + for (i = 0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg * z, int scan) { + stbi__context * s = z->s; + int Lf, p, i, 
q, h_max = 1, v_max = 1, c; + Lf = stbi__get16be(s); + if (Lf < 11) + return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG + p = stbi__get8(s); + if (p != 8) + return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); + if (s->img_y == 0) + return stbi__err("no header height", + "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); + if (s->img_x == 0) + return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) + return stbi__err("bad component count", "Corrupt JPEG"); + s->img_n = c; + for (i = 0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8 + 3 * s->img_n) + return stbi__err("bad SOF len", "Corrupt JPEG"); + + z->rgb = 0; + for (i = 0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = {'R', 'G', 'B'}; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); + if (!z->img_comp[i].h || z->img_comp[i].h > 4) + return stbi__err("bad H", "Corrupt JPEG"); + z->img_comp[i].v = q & 15; + if (!z->img_comp[i].v || z->img_comp[i].v > 4) + return stbi__err("bad V", "Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); + if (z->img_comp[i].tq > 3) + return stbi__err("bad TQ", "Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) + return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) + return stbi__err("too large", "Image too large to decode"); + + for (i = 0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) + h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) + v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i = 0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) + return stbi__err("bad H", "Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) + return stbi__err("bad V", "Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; + + for (i = 0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg * z, int scan) { + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) + return stbi__err("no SOI", "Corrupt JPEG"); + if (scan == STBI__SCAN_type) + return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z, m)) + return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) + return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) + return 0; + return 1; +} + +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg * j) { + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) + return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg * j) { + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) + return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) + return 0; + if (!stbi__parse_entropy_coded_data(j)) + return 0; + if (j->marker == STBI__MARKER_none) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) + return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) + return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) + return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc * (*resample_row_func)(stbi_uc * out, stbi_uc * in0, stbi_uc * in1, int w, int hs); + +#define stbi__div4(x) ((stbi_uc)((x) >> 2)) + +static stbi_uc * resample_row_1(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc * stbi__resample_row_v_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i = 0; i < w; ++i) + out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc * stbi__resample_row_h_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate two samples horizontally for every one in input + int i; + stbi_uc * input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0] * 3 + input[1] + 2); + for (i = 1; i < w - 1; ++i) { + int n = 3 * input[i] + 2; + out[i * 2 + 0] = stbi__div4(n + input[i - 1]); + out[i * 2 + 1] = stbi__div4(n + input[i + 1]); + } + out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2); + out[i * 2 + 1] = input[w - 1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc)((x) >> 4)) + +static stbi_uc * stbi__resample_row_hv_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate 2x2 samples for every one in input + int i, t0, t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + out[0] = stbi__div4(t1 + 2); + for (i = 1; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = stbi__div4(t1 + 2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc * stbi__resample_row_hv_2_simd(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // need to generate 2x2 samples for every one in 
input + int i = 0, t0, t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w - 1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *)(out + i * 2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i * 2, o); +#endif + + // "previous" value for next iter + t1 = 3 * in_near[i + 7] + in_far[i + 7]; + } + + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); + out[i * 2] = stbi__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = stbi__div4(t1 + 2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc * stbi__resample_row_generic(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) { + // resample with nearest-neighbor + int i, j; + STBI_NOTUSED(in_far); + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) + out[i * hs + j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count, + int step) { + int i; + for (i = 0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * stbi__float2fixed(1.40200f); + g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { + if (r < 0) + r = 0; + else + r = 255; + } + if ((unsigned)g > 255) { + if (g < 0) + g = 0; + else + g = 255; + } + if ((unsigned)b > 255) { + if (b < 0) + b = 0; + else + b = 255; + } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc * out, stbi_uc const * y, stbi_uc const * pcb, stbi_uc const * pcr, int count, + int step) { + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
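+        // constants are Q12 fixed point (value * 4096) held in 16 bits; _mm_mulhi_epi16 keeps
+        // the high 16 bits of each product, and Y is pre-shifted so all terms meet on a <<4
+        // scale (the 128 in y_bias becomes the +8 rounding term) that the final >>4 removes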
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i + 7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *)(out + 0), o0); + _mm_storeu_si128((__m128i *)(out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
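+        // same Q12 constants as the SSE2 path; vqdmulhq_s16 returns (a*b*2)>>16, so with cr/cb
+        // pre-shifted left by 7 the products land on the same <<4 scale as Y, and
+        // vqrshrun_n_s16(..., 4) rounds, rescales and saturates to 0..255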
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); + + for (; i + 7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8 * 4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * stbi__float2fixed(1.40200f); + g = y_fixed + cr * -stbi__float2fixed(0.71414f) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { + if (r < 0) + r = 0; + else + r = 255; + } + if ((unsigned)g > 255) { + if (g < 0) + g = 0; + else + g = 255; + } + if ((unsigned)b > 255) { + if (b < 0) + b = 0; + else + b = 255; + } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg * j) { + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg * j) { stbi__free_jpeg_components(j, j->s->img_n, 0); } + +typedef struct { + resample_row_func resample; + stbi_uc *line0, *line1; + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) { + unsigned int t = x * y + 128; + return (stbi_uc)((t + (t >> 8)) >> 8); +} + +static stbi_uc * load_jpeg_image(stbi__jpeg * z, int * out_x, int * out_y, int * comp, int req_comp) { + int n, 
decode_n, is_rgb; + z->s->img_n = 0; // make stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) + return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { + stbi__cleanup_jpeg(z); + return NULL; + } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { + stbi__cleanup_jpeg(z); + return NULL; + } + + // resample and color-convert + { + int k; + unsigned int i, j; + stbi_uc * output; + stbi_uc * coutput[4] = {NULL, NULL, NULL, NULL}; + + stbi__resample res_comp[4]; + + for (k = 0; k < decode_n; ++k) { + stbi__resample * r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { + stbi__cleanup_jpeg(z); + return stbi__errpuc("outofmem", "Out of memory"); + } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs - 1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) + r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) + r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) + r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) + r->resample = z->resample_row_hv_2_kernel; + else + r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { + stbi__cleanup_jpeg(z); + return stbi__errpuc("outofmem", "Out of memory"); + } + + // now go ahead and resample + for (j = 0; j < z->s->img_y; ++j) { + stbi_uc * out = output + n * z->s->img_x * j; + for (k = 0; k < decode_n; ++k) { + stbi__resample * r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? 
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc * y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i = 0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i = 0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i = 0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc * y = coutput[0]; + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + out[i] = y[i]; + else + for (i = 0; i < z->s->img_x; ++i) { + *out++ = y[i]; + *out++ = 255; + } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) + *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + unsigned char * result; + stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) + return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x, y, comp, req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context * s) { + int r; + stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) + return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg * j, int * x, int * y, int * comp) { + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind(j->s); + return 0; + } + if (x) + *x = j->s->img_x; + if (y) + *y = j->s->img_y; + if (comp) + *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp) { + int result; + stbi__jpeg * j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg))); + if (!j) + return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct { + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) { + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16 - bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman * z, const stbi_uc * sizelist, int num) { + int i, k = 0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16)code; + z->firstsymbol[i] = (stbi__uint16)k; + code = (code + sizes[i]); + if (sizes[i]) + if (code - 1 >= (1 << i)) + return stbi__err("bad codelengths", "Corrupt PNG"); + z->maxcode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16)((s << 9) | i); + z->size[c] = (stbi_uc)s; + z->value[c] = (stbi__uint16)i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s], s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct { + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char * zout; + char * zout_start; + char * zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf * z) { return (z->zbuffer >= z->zbuffer_end); } + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf * z) { return stbi__zeof(z) ? 0 : *z->zbuffer++; } + +static void stbi__fill_bits(stbi__zbuf * z) { + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf * z, int n) { + unsigned int k; + if (z->num_bits < n) + stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf * a, stbi__zhuffman * z) { + int b, s, k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s = STBI__ZFAST_BITS + 1;; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) + return -1; // invalid code! + // code size is s, so: + b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) + return -1; // some data was corrupt somewhere! + if (z->size[b] != s) + return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf * a, stbi__zhuffman * z) { + int b, s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf * z, char * zout, int n) // need to make room for n bytes +{ + char * q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) + return stbi__err("output buffer limit", "Corrupt PNG"); + cur = (unsigned int)(z->zout - z->zout_start); + limit = old_limit = (unsigned)(z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned)n) + return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if (limit > UINT_MAX / 2) + return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) + return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + +static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0}; + +static const int stbi__zdist_base[32] = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, + 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, + 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; + +static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; + +static int stbi__parse_huffman_block(stbi__zbuf * a) { + char * zout = a->zout; + for (;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) + return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) + return 0; + zout = a->zout; + } + *zout++ = (char)z; + } else { + stbi_uc * p; + int len, dist; + if (z == 256) { + a->zout = zout; + return 1; + } + if (z >= 286) + return stbi__err("bad huffman code", + "Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) + len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) + return stbi__err("bad huffman code", + "Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) + dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) + return stbi__err("bad dist", "Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) + return 0; + zout = a->zout; + } + p = (stbi_uc *)(zout - dist); + if (dist == 1) { // run of one byte; common in images. 
+ stbi_uc v = *p; + if (len) { + do + *zout++ = v; + while (--len); + } + } else { + if (len) { + do + *zout++ = *p++; + while (--len); + } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf * a) { + static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op + stbi_uc codelength_sizes[19]; + int i, n; + + int hlit = stbi__zreceive(a, 5) + 257; + int hdist = stbi__zreceive(a, 5) + 1; + int hclen = stbi__zreceive(a, 4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i = 0; i < hclen; ++i) { + int s = stbi__zreceive(a, 3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc)s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) + return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) + return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc)c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a, 2) + 3; + if (n == 0) + return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } else if (c == 17) { + c = stbi__zreceive(a, 3) + 3; + } else if (c == 18) { + c = stbi__zreceive(a, 7) + 11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) + return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) + return stbi__err("bad codelengths", "Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) + return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) + return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf * a) { + stbi_uc header[4]; + int len, nlen, k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) + return stbi__err("zlib corrupt", "Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) + return stbi__err("zlib corrupt", "Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) + return stbi__err("read past buffer", "Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) + return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf * a) { + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if ((cmf * 256 + flg) % 31 != 0) + return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec + if (flg & 32) + return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) + return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8}; +static const stbi_uc stbi__zdefault_distance[32] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf * a, int parse_header) { + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) + return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a, 1); + type = stbi__zreceive(a, 2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) + return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, STBI__ZNSYMS)) + return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) + return 0; + } else { + if (!stbi__compute_huffman_codes(a)) + return 0; + } + if (!stbi__parse_huffman_block(a)) + return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf * a, char * obuf, int olen, int exp, int parse_header) { + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(initial_size); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) + *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char * stbi_zlib_decode_malloc(char const * buffer, int len, int * outlen) { + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen, + int parse_header) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(initial_size); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) + 
*outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, char const * ibuffer, int ilen) { + stbi__zbuf a; + a.zbuffer = (stbi_uc *)ibuffer; + a.zbuffer_end = (stbi_uc *)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char * stbi_zlib_decode_noheader_malloc(char const * buffer, int len, int * outlen) { + stbi__zbuf a; + char * p = (char *)stbi__malloc(16384); + if (p == NULL) + return NULL; + a.zbuffer = (stbi_uc *)buffer; + a.zbuffer_end = (stbi_uc *)buffer + len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) + *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen) { + stbi__zbuf a; + a.zbuffer = (stbi_uc *)ibuffer; + a.zbuffer_end = (stbi_uc *)ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct { + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context * s) { + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context * s) { + static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + int i; + for (i = 0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) + return stbi__err("bad png sig", "Not a PNG"); + return 1; +} + +typedef struct { + stbi__context * s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + +enum { + STBI__F_none = 0, + STBI__F_sub = 1, + STBI__F_up = 2, + STBI__F_avg = 3, + STBI__F_paeth = 4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first, STBI__F_paeth_first}; + +static int stbi__paeth(int a, int b, int c) { + int p = a + b - c; + int pa = abs(p - a); + int pb = abs(p - b); + int pc = abs(p - c); + if (pa <= pb && pa <= pc) + return a; + if (pb <= pc) + return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0, 0x11, 0, 0, 0, 0x01}; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png * a, stbi_uc * raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, + stbi__uint32 y, int depth, int color) { + int bytes = (depth == 16 ? 
2 : 1); + stbi__context * s = a->s; + stbi__uint32 i, j, stride = x * out_n * bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (stbi_uc *)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) + return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) + return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) + return stbi__err("not enough pixels", "Corrupt PNG"); + + for (j = 0; j < y; ++j) { + stbi_uc * cur = a->out + stride * j; + stbi_uc * prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter", "Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) + return stbi__err("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) + filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none: + cur[k] = raw[k]; + break; + case STBI__F_sub: + cur[k] = raw[k]; + break; + case STBI__F_up: + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); + break; + case STBI__F_paeth: + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); + break; + case STBI__F_avg_first: + cur[k] = raw[k]; + break; + case STBI__F_paeth_first: + cur[k] = raw[k]; + break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1) * filter_bytes; +#define STBI__CASE(f) \ + case f: \ + for (k = 0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
+ case STBI__F_none: + memcpy(cur, raw, nk); + break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); } + break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } + break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } + break; + STBI__CASE(STBI__F_paeth) { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } + break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); } + break; + } +#undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n + 1 == out_n); +#define STBI__CASE(f) \ + case f: \ + for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \ + for (k = 0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } + break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]); } + break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } + break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } + break; + STBI__CASE(STBI__F_paeth) { + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); + } + break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } + break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0)); } + break; + } +#undef STBI__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { + cur[filter_bytes + 1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j = 0; j < y; ++j) { + stbi_uc * cur = a->out + stride * j; + stbi_uc * in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for + // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that + // will be skipped in the later loop + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k = x * img_n; k >= 2; k -= 2, ++in) { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) + *cur++ = scale * ((*in >> 4)); + } else if (depth == 2) { + for (k = x * img_n; k >= 4; k -= 4, ++in) { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) + *cur++ = scale * ((*in >> 6)); + if (k > 1) + *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) + *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k = x * img_n; k >= 8; k -= 8, ++in) { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) + *cur++ = scale * ((*in >> 7)); + if (k > 1) + *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) + *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) + *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) + *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) + *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) + *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) { + for (q = x - 1; q >= 0; --q) { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q = x - 1; q >= 0; --q) { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc * cur = a->out; + stbi__uint16 * cur16 = (stbi__uint16 *)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png * a, stbi_uc * image_data, stbi__uint32 image_data_len, int out_n, int depth, + int color, int interlaced) { + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc * final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) + return stbi__err("outofmem", "Out of memory"); + for (p = 0; p < 7; ++p) { + int xorig[] = {0, 4, 0, 2, 0, 1, 0}; + int yorig[] = {0, 0, 4, 0, 2, 0, 1}; + int xspc[] = {8, 8, 4, 4, 2, 2, 1}; + int yspc[] = {8, 8, 8, 4, 4, 2, 2}; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j = 0; j < y; ++j) { + for (i = 0; i < x; ++i) { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, a->out + (j * x + i) * out_bytes, + out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png * z, stbi_uc tc[3], int out_n) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc * p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png * z, stbi__uint16 tc[3], int out_n) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 * p = (stbi__uint16 *)z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__expand_png_palette(stbi__png * a, stbi_uc * palette, int len, int pal_img_n) { + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; + + p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } else { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; + +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) { + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) { + stbi__de_iphone_flag_global = flag_true_if_should_convert; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) { + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) { + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load \ + (stbi__unpremultiply_on_load_set ? stbi__unpremultiply_on_load_local : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set ? 
stbi__de_iphone_flag_local : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png * z) { + stbi__context * s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc * p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i = 0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = (t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a, b, c, d) (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d)) + +static int stbi__parse_png_file(stbi__png * z, int scan, int req_comp) { + stbi_uc palette[1024], pal_img_n = 0; + stbi_uc has_trans = 0, tc[3] = {0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0; + int first = 1, k, interlace = 0, color = 0, is_iphone = 0; + stbi__context * s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) + return 0; + + if (scan == STBI__SCAN_type) + return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C', 'g', 'B', 'I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I', 'H', 'D', 'R'): { + int comp, filter; + if (!first) + return stbi__err("multiple IHDR", "Corrupt PNG"); + first = 0; + if (c.length != 13) + return stbi__err("bad IHDR len", "Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + z->depth = stbi__get8(s); + if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) + return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); + if (color > 6) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) + return stbi__err("bad ctype", "Corrupt PNG"); + if (color == 3) + pal_img_n = 3; + else if (color & 1) + return stbi__err("bad ctype", "Corrupt PNG"); + comp = stbi__get8(s); + if (comp) + return stbi__err("bad comp method", "Corrupt PNG"); + filter = stbi__get8(s); + if (filter) + return stbi__err("bad filter method", "Corrupt PNG"); + interlace = stbi__get8(s); + if (interlace > 1) + return stbi__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) + return stbi__err("0-pixel image", "Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) + return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) + return stbi__err("too large", "Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P', 'L', 'T', 'E'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) + return stbi__err("invalid PLTE", "Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) + return stbi__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) { + palette[i * 4 + 0] = stbi__get8(s); + palette[i * 4 + 1] = stbi__get8(s); + palette[i * 4 + 2] = stbi__get8(s); + palette[i * 4 + 3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t', 'R', 'N', 'S'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) + return stbi__err("tRNS after IDAT", "Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { + s->img_n = 4; + return 1; + } + if (pal_len == 0) + return stbi__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) + return stbi__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) + return stbi__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (stbi__uint32)s->img_n * 2) + return stbi__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { + ++s->img_n; + return 1; + } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * + stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I', 'D', 'A', 'T'): { + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) + return stbi__err("no PLTE", "Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) + return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) + return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc * p; + if (idata_limit == 0) + idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); + if (p == NULL) + return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata + ioff, c.length)) + return stbi__err("outofdata", "Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I', 'E', 'N', 'D'): { + stbi__uint32 raw_len, bpl; + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) + return 1; + if (z->idata == NULL) + return stbi__err("no IDAT", "Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag((char *)z->idata, ioff, raw_len, + (int *)&raw_len, !is_iphone); + if (z->expanded == NULL) + return 0; // zlib should set error + STBI_FREE(z->idata); + z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) + return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) + return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) + return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); + z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) + return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { +#ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); +#endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void * stbi__do_png(stbi__png * p, int * x, int * y, int * n, int req_comp, stbi__result_info * ri) { + void * result = NULL; + if (req_comp < 0 || req_comp > 4) + return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + 
result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) + return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) + *n = p->s->img_n; + } + STBI_FREE(p->out); + p->out = NULL; + STBI_FREE(p->expanded); + p->expanded = NULL; + STBI_FREE(p->idata); + p->idata = NULL; + + return result; +} + +static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi__png p; + p.s = s; + return stbi__do_png(&p, x, y, comp, req_comp, ri); +} + +static int stbi__png_test(stbi__context * s) { + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; +} + +static int stbi__png_info_raw(stbi__png * p, int * x, int * y, int * comp) { + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind(p->s); + return 0; + } + if (x) + *x = p->s->img_x; + if (y) + *y = p->s->img_y; + if (comp) + *comp = p->s->img_n; + return 1; +} + +static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp) { + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); +} + +static int stbi__png_is16(stbi__context * s) { + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; +} +#endif + +// Microsoft/Windows BMP image + +#ifndef STBI_NO_BMP +static int stbi__bmp_test_raw(stbi__context * s) { + int r; + int sz; + if (stbi__get8(s) != 'B') + return 0; + if (stbi__get8(s) != 'M') + return 0; + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset + sz = stbi__get32le(s); + r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); + return r; +} + +static int stbi__bmp_test(stbi__context * s) { + int r = stbi__bmp_test_raw(s); + stbi__rewind(s); + return r; +} + +// returns 0..31 for the highest set bit +static int stbi__high_bit(unsigned int z) { + int n = 0; + if (z == 0) + return -1; + if (z >= 0x10000) { + n += 16; + z >>= 16; + } + if (z >= 0x00100) { + n += 8; + z >>= 8; + } + if (z >= 0x00010) { + n += 4; + z >>= 4; + } + if (z >= 0x00004) { + n += 2; + z >>= 2; + } + if (z >= 0x00002) { + n += 1; /* >>= 1;*/ + } + return n; +} + +static int stbi__bitcount(unsigned int a) { + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; +} + +// extract an arbitrarily-aligned N-bit value (N=bits) +// from v, and then make it 8-bits long and fractionally +// extend it to full full range. 
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) { + static unsigned int mul_table[9] = { + 0, + 0xff /*0b11111111*/, + 0x55 /*0b01010101*/, + 0x49 /*0b01001001*/, + 0x11 /*0b00010001*/, + 0x21 /*0b00100001*/, + 0x41 /*0b01000001*/, + 0x81 /*0b10000001*/, + 0x01 /*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0, 0, 1, 0, 2, 4, 6, 0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8 - bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct { + int bpp, offset, hsz; + unsigned int mr, mg, mb, ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data * info, int compress) { + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void * stbi__bmp_parse_header(stbi__context * s, stbi__bmp_data * info) { + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') + return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) + return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) + return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) + return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) + return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) + return stbi__errpuc("BMP JPEG/PNG", + "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) + return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i = 0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *)1; +} + +static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * out; + unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a; + stbi_uc pal[256][4]; + int psize = 0, i, j, width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int)s->img_y) > 0; + s->img_y = abs((int)s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256 * 4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 
4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z = 0; + if (psize == 0 || psize > 256) { + STBI_FREE(out); + return stbi__errpuc("invalid", "Corrupt BMP"); + } + for (i = 0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) + stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) + width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) + width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) + width = s->img_x; + else { + STBI_FREE(out); + return stbi__errpuc("bad bpp", "Corrupt BMP"); + } + pad = (-width) & 3; + if (info.bpp == 1) { + for (j = 0; j < (int)s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i = 0; i < (int)s->img_x; ++i) { + int color = (v >> bit_offset) & 0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) + out[z++] = 255; + if (i + 1 == (int)s->img_x) + break; + if ((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j = 0; j < (int)s->img_y; ++j) { + for (i = 0; i < (int)s->img_x; i += 2) { + int v = stbi__get8(s), v2 = 0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) + out[z++] = 255; + if (i + 1 == (int)s->img_x) + break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) + out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0, bcount = 0, acount = 0; + int z = 0; + int easy = 0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) + width = 3 * s->img_x; + else if (info.bpp == 16) + width = 2 * s->img_x; + else /* bpp = 32 and pad = 0 */ + width = 0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { + STBI_FREE(out); + return stbi__errpuc("bad masks", "Corrupt BMP"); + } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr) - 7; + rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg) - 7; + gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb) - 7; + bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma) - 7; + acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { + STBI_FREE(out); + return stbi__errpuc("bad masks", "Corrupt BMP"); + } + } + for (j = 0; j < (int)s->img_y; ++j) { + if (easy) { + for (i = 0; i < (int)s->img_x; ++i) { + unsigned char a; + out[z + 2] = stbi__get8(s); + out[z + 1] = stbi__get8(s); + out[z + 0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? 
stbi__get8(s) : 255); + all_a |= a; + if (target == 4) + out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i = 0; i < (int)s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) + out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j = 0; j < (int)s->img_y >> 1; ++j) { + stbi_uc * p1 = out + j * s->img_x * target; + stbi_uc * p2 = out + (s->img_y - 1 - j) * s->img_x * target; + for (i = 0; i < (int)s->img_x * target; ++i) { + t = p1[i]; + p1[i] = p2[i]; + p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) + *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int * is_rgb16) { + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) + *is_rgb16 = 0; + switch (bits_per_pixel) { + case 8: + return STBI_grey; + case 16: + if (is_grey) + return STBI_grey_alpha; + // fallthrough + case 15: + if (is_rgb16) + *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: + return bits_per_pixel / 8; + default: + return 0; + } +} + +static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp) { + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if (tga_colormap_type > 1) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if (tga_colormap_type == 1) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s, 9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if (tga_w < 1) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if (tga_h < 1) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if 
((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if (!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) + *x = tga_w; + if (y) + *y = tga_h; + if (comp) + *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context * s) { + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if (tga_color_type > 1) + goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if (tga_color_type == 1) { // colormapped (paletted) image + if (sz != 1 && sz != 9) + goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s, 4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + goto errorEnd; + stbi__skip(s, 4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) + goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s, 9); // skip colormap specification and image x/y origin + } + if (stbi__get16le(s) < 1) + goto errorEnd; // test width + if (stbi__get16le(s) < 1) + goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) + goto errorEnd; // for colormapped images, bpp is size of an index + if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) + goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context * s, stbi_uc * out) { + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255) / 31); + out[1] = (stbi_uc)((g * 255) / 31); + out[2] = (stbi_uc)((b * 255) / 31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. 
+} + +static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16 = 0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) + // image data + unsigned char * tga_data; + unsigned char * tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO + + if (tga_height > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + + // do a tiny bit of precessing + if (tga_image_type >= 8) { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // If I'm paletted, then I'll use the number of bits from the palette + if (tga_indexed) + tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + + if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + + // tga info + *x = tga_width; + *y = tga_height; + if (comp) + *comp = tga_comp; + + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); + + tga_data = (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) + return stbi__errpuc("outofmem", "Out of memory"); + + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset); + + if (!tga_indexed && !tga_is_RLE && !tga_rgb16) { + for (i = 0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height - i - 1 : i; + stbi_uc * tga_row = tga_data + row * tga_width * tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if (tga_indexed) { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + + // any data to skip? 
(offset usually = 0) + stbi__skip(s, tga_palette_start); + // load the palette + tga_palette = (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); + } + if (tga_rgb16) { + stbi_uc * pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i = 0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i = 0; i < tga_width * tga_height; ++i) { + // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? + if (tga_is_RLE) { + if (RLE_count == 0) { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if (!RLE_repeating) { + read_next_pixel = 1; + } + } else { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if (read_next_pixel) { + // load however much data we did have + if (tga_indexed) { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s); + if (pal_idx >= tga_palette_len) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx + j]; + } + } else if (tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i * tga_comp + j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if (tga_inverted) { + for (j = 0; j * 2 < tga_height; ++j) { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if (tga_palette != NULL) { + STBI_FREE(tga_palette); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) { + unsigned char * tga_pixel = tga_data; + for (i = 0; i < tga_width * tga_height; ++i) { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... 
[8^( + tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context * s) { + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context * s, stbi_uc * p, int pixelCount) { + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) + return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) + return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) { + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w, h; + stbi_uc * out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. + if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s, stbi__get32be(s)); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s)); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s)); + + // Find out if the data is compressed. 
+ // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *)stbi__malloc(4 * w * h); + + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w * h; + + // Initialize the data to zero. + // memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc * p; + + p = out + channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. + for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 * q = ((stbi__uint16 *)out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc * p = out + channel; + stbi_uc val = channel == 3 ? 
255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 * q = ((stbi__uint16 *)out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16)stbi__get16be(s); + } else { + stbi_uc * p = out + channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc)(stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i = 0; i < w * h; ++i) { + stbi__uint16 * pixel = (stbi__uint16 *)out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a); + pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a); + pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a); + } + } + } else { + for (i = 0; i < w * h; ++i) { + unsigned char * pixel = out + 4 * i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char)(pixel[0] * ra + inv_a); + pixel[1] = (unsigned char)(pixel[1] * ra + inv_a); + pixel[2] = (unsigned char)(pixel[2] * ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } + + if (comp) + *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context * s, const char * str) { + int i; + for (i = 0; i < 4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context * s) { + int i; + + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) + return 0; + + for (i = 0; i < 84; ++i) + stbi__get8(s); + + if (!stbi__pic_is4(s, "PICT")) + return 0; + + return 1; +} + +typedef struct { + stbi_uc size, type, channel; +} stbi__pic_packet; + +static stbi_uc * stbi__readval(stbi__context * s, int channel, stbi_uc * dest) { + int mask = 0x80, i; + + for (i = 0; i < 4; ++i, mask >>= 1) { + if (channel & mask) { + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "PIC file too short"); + dest[i] = stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel, stbi_uc * dest, const stbi_uc * src) { + int mask = 0x80, i; + + for (i = 0; i < 4; ++i, mask >>= 1) + if (channel & mask) + dest[i] = src[i]; +} + +static stbi_uc * stbi__pic_load_core(stbi__context * s, int width, int height, int * comp, stbi_uc * result) { + int act_comp = 0, num_packets = 0, y, chained; + stbi__pic_packet packets[10]; + + // this will (should...) cater for even some bizarre stuff like having data + // for the same channel in multiple packets. 
+ do { + stbi__pic_packet * packet; + + if (num_packets == sizeof(packets) / sizeof(packets[0])) + return stbi__errpuc("bad format", "too many packets"); + + packet = &packets[num_packets++]; + + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + + act_comp |= packet->channel; + + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (reading packets)"); + if (packet->size != 8) + return stbi__errpuc("bad format", "packet isn't 8bpp"); + } while (chained); + + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? + + for (y = 0; y < height; ++y) { + int packet_idx; + + for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) { + stbi__pic_packet * packet = &packets[packet_idx]; + stbi_uc * dest = result + y * width * 4; + + switch (packet->type) { + default: + return stbi__errpuc("bad format", "packet has bad compression type"); + + case 0: { // uncompressed + int x; + + for (x = 0; x < width; ++x, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + break; + } + + case 1: // Pure RLE + { + int left = width, i; + + while (left > 0) { + stbi_uc count, value[4]; + + count = stbi__get8(s); + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pure read count)"); + + if (count > left) + count = (stbi_uc)left; + + if (!stbi__readval(s, packet->channel, value)) + return 0; + + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + left -= count; + } + } break; + + case 2: { // Mixed RLE + int left = width; + while (left > 0) { + int count = stbi__get8(s), i; + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (mixed read count)"); + + if (count >= 128) { // Repeated + stbi_uc value[4]; + + if (count == 128) + count = stbi__get16be(s); + else + count -= 127; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); + + if (!stbi__readval(s, packet->channel, value)) + return 0; + + for (i = 0; i < count; ++i, dest += 4) + stbi__copyval(packet->channel, dest, value); + } else { // Raw + ++count; + if (count > left) + return stbi__errpuc("bad file", "scanline overrun"); + + for (i = 0; i < count; ++i, dest += 4) + if (!stbi__readval(s, packet->channel, dest)) + return 0; + } + left -= count; + } + break; + } + } + } + } + + return result; +} + +static void * stbi__pic_load(stbi__context * s, int * px, int * py, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * result; + int i, x, y, internal_comp; + STBI_NOTUSED(ri); + + if (!comp) + comp = &internal_comp; + + for (i = 0; i < 92; ++i) + stbi__get8(s); + + x = stbi__get16be(s); + y = stbi__get16be(s); + + if (y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + + if (stbi__at_eof(s)) + return stbi__errpuc("bad file", "file too short (pic header)"); + if (!stbi__mad3sizes_valid(x, y, 4, 0)) + return stbi__errpuc("too large", "PIC image too large to decode"); + + stbi__get32be(s); // skip `ratio' + stbi__get16be(s); // skip `fields' + stbi__get16be(s); // skip `pad' + + // intermediate buffer is RGBA + result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0); + if (!result) + return stbi__errpuc("outofmem", "Out of memory"); + memset(result, 0xff, x * y * 4); + + if (!stbi__pic_load_core(s, x, y, comp, result)) { + STBI_FREE(result); + result = 0; + } + *px = x; + *py = y; + if (req_comp == 0) + 
req_comp = *comp; + result = stbi__convert_format(result, 4, req_comp, x, y); + + return result; +} + +static int stbi__pic_test(stbi__context * s) { + int r = stbi__pic_test_core(s); + stbi__rewind(s); + return r; +} +#endif + +// ************************************************************************************************* +// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct { + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct { + int w, h; + stbi_uc * out; // output buffer (always 4 components) + stbi_uc * background; // The current "background" as far as a gif is concerned + stbi_uc * history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc * color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context * s) { + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') + return 0; + if (stbi__get8(s) != 'a') + return 0; + return 1; +} + +static int stbi__gif_test(stbi__context * s) { + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context * s, stbi_uc pal[256][4], int num_entries, int transp) { + int i; + for (i = 0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context * s, stbi__gif * g, int * comp, int is_info) { + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') + return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') + return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) + return stbi__err("too large", "Very large image (corrupt?)"); + + if (comp != 0) + *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) + return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context * s, int * x, int * y, int * comp) { + stbi__gif * g = (stbi__gif *)stbi__malloc(sizeof(stbi__gif)); + if (!g) + return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind(s); + return 0; + } + if (x) + *x = g->w; + if (y) + *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif * g, stbi__uint16 code) { + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) + return; + + idx = g->cur_x + 
g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc * stbi__process_gif_raster(stbi__context * s, stbi__gif * g) { + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw * p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) + return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc)init_code; + g->codes[init_code].suffix = (stbi_uc)init_code; + } + + // support no starting clear code + avail = clear + 2; + oldcode = -1; + + len = 0; + for (;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32)stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s, len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16)oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? 
p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16)code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc * stbi__gif_load_next(stbi__context * s, stbi__gif * g, int * comp, int req_comp, stbi_uc * two_back) { + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp, 0)) + return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *)stbi__malloc(4 * pcount); + g->background = (stbi_uc *)stbi__malloc(4 * pcount); + g->history = (stbi_uc *)stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. + memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; + + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } + + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy(&g->out[pi * 4], &two_back[pi * 4], 4); + } + } + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy(&g->out[pi * 4], &g->background[pi * 4], 4); + } + } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. 
+ } + + // background is what out is after the undoing of the previou frame; + memcpy(g->background, g->out, 4 * g->w * g->h); + } + + // clear my history; + memset(g->history, 0x00, g->w * g->h); // pixels that were affected previous frame + + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { + stbi__int32 x, y, w, h; + stbi_uc * o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + // if the width of the specified rectangle is 0, that means + // we may not see *any* pixels or the image is malformed; + // to make sure this is caught, move the current y down to + // max_y (which is what out_gif_code checks). + if (w == 0) + g->cur_y = g->max_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc *)g->lpal; + } else if (g->flags & 0x80) { + g->color_table = (stbi_uc *)g->pal; + } else + return stbi__errpuc("missing color table", "Corrupt GIF"); + + o = stbi__process_gif_raster(s, g); + if (!o) + return NULL; + + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = + 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); + } + } + } + + return o; + } + + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. 
+ + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *)s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void * stbi__load_gif_main_outofmem(stbi__gif * g, stbi_uc * out, int ** delays) { + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) + STBI_FREE(out); + if (delays && *delays) + STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp) { + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc * u = 0; + stbi_uc * out = 0; + stbi_uc * two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *)s) + u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void * tmp = (stbi_uc *)STBI_REALLOC_SIZED(out, out_size, layers * stride); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc *)tmp; + out_size = layers * stride; + } + + if (delays) { + int * new_delays = (int *)STBI_REALLOC_SIZED(*delays, delays_size, sizeof(int) * layers); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc *)stbi__malloc(layers * stride); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int *)stbi__malloc(layers * sizeof(int)); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy(out + ((layers - 1) * stride), u, stride); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void * stbi__gif_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *)s) + u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. 
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp) { return stbi__gif_info_raw(s, x, y, comp); } +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context * s, const char * signature) { + int i; + for (i = 0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context * s) { + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if (!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char * stbi__hdr_gettoken(stbi__context * z, char * buffer) { + int len = 0; + char c = '\0'; + + c = (char)stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN - 1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char)stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float * output, stbi_uc * input, int req_comp) { + if (input[3] != 0) { + float f1; + // Exponent + f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) + output[1] = 1; + if (req_comp == 4) + output[3] = 1; + } else { + switch (req_comp) { + case 4: + output[3] = 1; /* fallthrough */ + case 3: + output[0] = output[1] = output[2] = 0; + break; + case 2: + output[1] = 1; /* fallthrough */ + case 1: + output[0] = 0; + break; + } + } +} + +static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + char buffer[STBI__HDR_BUFLEN]; + char * token; + int valid = 0; + int width, height; + stbi_uc * scanline; + float * hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1, c2, z; + const char * headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s, buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for (;;) { + token = stbi__hdr_gettoken(s, buffer); + if (token[0] == 0) + break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + valid = 1; + } + + if (!valid) + return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
+ token = stbi__hdr_gettoken(s, buffer); + if (strncmp(token, "-Y ", 3)) + return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = (int)strtol(token, &token, 10); + while (*token == ' ') + ++token; + if (strncmp(token, "+X ", 3)) + return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = (int)strtol(token, NULL, 10); + + if (height > STBI_MAX_DIMENSIONS) + return stbi__errpf("too large", "Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) + return stbi__errpf("too large", "Very large image (corrupt?)"); + + *x = width; + *y = height; + + if (comp) + *comp = 3; + if (req_comp == 0) + req_comp = 3; + + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + return stbi__errpf("too large", "HDR image is too large"); + + // Read data + hdr_data = (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) + return stbi__errpf("outofmem", "Out of memory"); + + // Load image data + // image data is stored as some number of sca + if (width < 8 || width >= 32768) { + // Read flat data + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + stbi__getn(s, rgbe, 4); + stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; + + for (j = 0; j < height; ++j) { + c1 = stbi__get8(s); + c2 = stbi__get8(s); + len = stbi__get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + stbi_uc rgbe[4]; + rgbe[0] = (stbi_uc)c1; + rgbe[1] = (stbi_uc)c2; + rgbe[2] = (stbi_uc)len; + rgbe[3] = (stbi_uc)stbi__get8(s); + stbi__hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + STBI_FREE(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= stbi__get8(s); + if (len != width) { + STBI_FREE(hdr_data); + STBI_FREE(scanline); + return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); + } + if (scanline == NULL) { + scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0); + if (!scanline) { + STBI_FREE(hdr_data); + return stbi__errpf("outofmem", "Out of memory"); + } + } + + for (k = 0; k < 4; ++k) { + int nleft; + i = 0; + while ((nleft = width - i) > 0) { + count = stbi__get8(s); + if (count > 128) { + // Run + value = stbi__get8(s); + count -= 128; + if ((count == 0) || (count > nleft)) { + STBI_FREE(hdr_data); + STBI_FREE(scanline); + return stbi__errpf("corrupt", "bad RLE data in HDR"); + } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + if ((count == 0) || (count > nleft)) { + STBI_FREE(hdr_data); + STBI_FREE(scanline); + return stbi__errpf("corrupt", "bad RLE data in HDR"); + } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = stbi__get8(s); + } + } + } + for (i = 0; i < width; ++i) + stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp); + } + if (scanline) + STBI_FREE(scanline); + } + + return hdr_data; +} + +static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp) { + char buffer[STBI__HDR_BUFLEN]; + char * token; + int valid = 0; + int dummy; + + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + + if (stbi__hdr_test(s) == 0) { + stbi__rewind(s); + return 0; + } + + for (;;) { + token = stbi__hdr_gettoken(s, buffer); + if 
(token[0] == 0) + break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) + valid = 1; + } + + if (!valid) { + stbi__rewind(s); + return 0; + } + token = stbi__hdr_gettoken(s, buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind(s); + return 0; + } + token += 3; + *y = (int)strtol(token, &token, 10); + while (*token == ' ') + ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind(s); + return 0; + } + token += 3; + *x = (int)strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp) { + void * p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind(s); + return 0; + } + if (x) + *x = s->img_x; + if (y) + *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp) { + int channelCount, dummy, depth; + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind(s); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind(s); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context * s) { + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind(s); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind(s); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind(s); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind(s); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp) { + int act_comp = 0, num_packets = 0, chained, dummy; + stbi__pic_packet packets[10]; + + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + + if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind(s); + return 0; + } + if ((*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet * packet; + + if (num_packets == sizeof(packets) / sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind(s); + return 0; + } + if (packet->size != 8) { + stbi__rewind(s); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context * s) { + char p, t; + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + return 1; +} + +static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) { + stbi_uc * out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) + return stbi__errpuc("too large", "Very large image (corrupt?)"); + + *x = s->img_x; + *y = s->img_y; + if (comp) + *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *)stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) + return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } + + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) + return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; } + +static void stbi__pnm_skip_whitespace(stbi__context * s, char * c) { + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char)stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') + *c = (char)stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; } + +static int stbi__pnm_getinteger(stbi__context * s, char * c) { + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value * 10 + (*c - '0'); + *c = (char)stbi__get8(s); + if ((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; +} + +static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp) { + int maxv, dummy; + char c, p, t; + + if (!x) + x = &dummy; + if (!y) + y = &dummy; + if (!comp) + comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char)stbi__get8(s); + t = (char)stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char)stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if (*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context * s) { + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context * s, int * x, int * y, int * comp) { +#ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) + return 1; +#endif + +#ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) + return 1; +#endif + +// test tga last because it's a crappy test! +#ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; +#endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context * s) { +#ifndef STBI_NO_PNG + if (stbi__png_is16(s)) + return 1; +#endif + +#ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) + return 1; +#endif + +#ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) + return 1; +#endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp) { + FILE * f = stbi__fopen(filename, "rb"); + int result; + if (!f) + return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp) { + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s, x, y, comp); + fseek(f, pos, SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const * filename) { + FILE * f = stbi__fopen(filename, "rb"); + int result; + if (!f) + return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE * f) { + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f, pos, SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__info_main(&s, x, y, comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * c, void * user, int * x, int * y, int * comp) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); + return stbi__info_main(&s, x, y, comp); +} + +STBIDEF int 
stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len) { + stbi__context s; + stbi__start_mem(&s, buffer, len); + return stbi__is_16_main(&s); +} + +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * c, void * user) { + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); + return stbi__is_16_main(&s); +} + +#endif // STB_IMAGE_IMPLEMENTATION + +/* + revision history: + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug + 1-bit BMP + *_is_16_bit api + avoid warnings + 2.16 (2017-07-23) all functions have 16-bit variants; + STBI_NO_STDIO works again; + compilation fixes; + fix rounding in unpremultiply; + optimize vertical flip; + disable raw_len validation; + documentation fixes + 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; + warning fixes; disable run-time SSE detection on gcc; + uniform handling of optional "return" values; + thread-safe initialization of zlib tables + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) allocate large structures on the stack + remove white matting for transparent PSD + fix reported channel count for PNG & BMP + re-enable SSE2 in non-gcc 64-bit + support RGB-formatted JPEG + read 16-bit PNGs (only as 8-bit) + 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED + 2.09 (2016-01-16) allow comments in PNM files + 16-bit-per-pixel TGA (not bit-per-component) + info() for TGA could break due to .hdr handling + info() for BMP to shares code instead of sloppy parse + can use STBI_REALLOC_SIZED if allocator doesn't support realloc + code cleanup + 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA + 2.07 (2015-09-13) fix compiler warnings + partial animated GIF support + limited 16-bpc PSD support + #ifdef unused functions + bug with < 92 byte PIC,PNM,HDR,TGA + 2.06 (2015-04-19) fix bug where PSD returns wrong '*comp' value + 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning + 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit + 2.03 (2015-04-12) extra corruption checking (mmozeiko) + stbi_set_flip_vertically_on_load (nguillemot) + fix NEON support; fix mingw support + 2.02 (2015-01-19) fix incorrect assert, fix warning + 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2 + 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG + 2.00 (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) + progressive JPEG (stb) + PGM/PPM support (Ken Miller) + STBI_MALLOC,STBI_REALLOC,STBI_FREE + GIF bugfix -- seemingly never worked + STBI_NO_*, STBI_ONLY_* + 1.48 (2014-12-14) fix incorrectly-named assert() + 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb) + optimize PNG (ryg) + fix bug in interlaced PNG with user-specified channel count (stb) + 1.46 (2014-08-26) + fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG + 1.45 (2014-08-16) + fix MSVC-ARM internal compiler error by wrapping malloc + 1.44 (2014-08-07) + various warning fixes from Ronny Chevalier + 1.43 (2014-07-15) + fix MSVC-only compiler problem in code changed in 1.42 + 1.42 (2014-07-09) + don't define _CRT_SECURE_NO_WARNINGS (affects user code) + fixes to 
stbi__cleanup_jpeg path + added STBI_ASSERT to avoid requiring assert.h + 1.41 (2014-06-25) + fix search&replace from 1.36 that messed up comments/error messages + 1.40 (2014-06-22) + fix gcc struct-initialization warning + 1.39 (2014-06-15) + fix to TGA optimization when req_comp != number of components in TGA; + fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite) + add support for BMP version 5 (more ignored fields) + 1.38 (2014-06-06) + suppress MSVC warnings on integer casts truncating values + fix accidental rename of 'skip' field of I/O + 1.37 (2014-06-04) + remove duplicate typedef + 1.36 (2014-06-03) + convert to header file single-file library + if de-iphone isn't set, load iphone images color-swapped instead of returning NULL + 1.35 (2014-05-27) + various warnings + fix broken STBI_SIMD path + fix bug where stbi_load_from_file no longer left file pointer in correct place + fix broken non-easy path for 32-bit BMP (possibly never used) + TGA optimization by Arseny Kapoulkine + 1.34 (unknown) + use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case + 1.33 (2011-07-14) + make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements + 1.32 (2011-07-13) + support for "info" function for all supported filetypes (SpartanJ) + 1.31 (2011-06-20) + a few more leak fixes, bug in PNG handling (SpartanJ) + 1.30 (2011-06-11) + added ability to load files via callbacks to accomidate custom input streams (Ben Wenger) + removed deprecated format-specific test/load functions + removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks + anyway error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) fix inefficiency in + decoding 32-bit BMP (David Woo) 1.29 (2010-08-16) various warning fixes from Aurelien Pocheville 1.28 (2010-08-01) + fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) + cast-to-stbi_uc to fix warnings + 1.26 (2010-07-24) + fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) + refix trans_data warning (Won Chun) + 1.24 (2010-07-12) + perf improvements reading from files on platforms with lock-heavy fgetc() + minor perf improvements for jpeg + deprecated type-specific functions so we'll get feedback if they're needed + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) + removed image *writing* support + stbi_info support from Jetro Lauha + GIF support from Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva) + 1.21 fix use of 'stbi_uc' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + 1.19 bug in interlaced PNG corruption check (found by ryg) + 1.18 (2008-08-02) + fix a threading bug (local mutable static) + 1.17 support interlaced PNG + 1.16 major bugfix - stbi__convert_format converted one too many pixels + 1.15 initialize some fields for thread safety + 1.14 fix threadsafe conversion bug + header-file-only version (#define STBI_HEADER_FILE_ONLY before including) + 1.13 threadsafe + 1.12 const qualifiers in the API + 1.11 Support installable IDCT, colorspace conversion routines + 1.10 Fixes for 64-bit (don't use "unsigned long") + optimized upsampling by Fabian "ryg" Giesen + 1.09 Fix format-conversion for PSD code (bad global variables!) 
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/common/train.cpp b/common/train.cpp index 35a4cf9e6..bc15b7a03 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -236,8 +236,8 @@ int64_t get_example_targets_batch( int64_t used_samples = 0; ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(lctx); - llama_token eos = llama_token_eos(lctx); + llama_token bos = llama_token_bos(llama_get_model(lctx)); + llama_token eos = llama_token_eos(llama_get_model(lctx)); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= 0) { out_tokens.resize(n_tokens); @@ -924,7 +924,7 @@ size_t tokenize_file( for (llama_token token=0; token < n_vocab; ++token) { max_token_text_size = std::max( max_token_text_size, - strlen(llama_token_get_text(lctx, token))); + strlen(llama_token_get_text(llama_get_model(lctx), token))); } // upper bound of context byte length. 
@@ -966,7 +966,7 @@ size_t tokenize_file( (int) buf_sample.size(), tok_sample.data(), (int) tok_sample.size(), - false); + false, false); if (n_tokens < 0) { tok_sample.resize(-n_tokens); n_tokens = llama_tokenize(llama_get_model(lctx), @@ -974,7 +974,7 @@ size_t tokenize_file( (int) buf_sample.size(), tok_sample.data(), (int) tok_sample.size(), - false); + false, false); GGML_ASSERT(n_tokens >= 0); } GGML_ASSERT(n_tokens <= (int) tok_sample.size()); @@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() { params.n_batch = 8; params.n_gradient_accumulation = 1; params.n_epochs = -1; + params.n_gpu_layers = 0; params.custom_n_ctx = false; @@ -1080,6 +1081,7 @@ struct train_params_common get_default_train_params_common() { params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; + return params; } @@ -1425,7 +1427,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; - if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, *sched, opt->loss_after); diff --git a/common/train.h b/common/train.h index 42fa704b8..d86c93cc4 100644 --- a/common/train.h +++ b/common/train.h @@ -44,6 +44,7 @@ struct train_params_common { int n_batch; int n_gradient_accumulation; int n_epochs; + int n_gpu_layers; bool custom_n_ctx; diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 513a7516a..67ccbe99f 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace: "ftype", type=int, choices=[0, 1], default=1, nargs='?', help="output format - use 0 for float32, 1 for float16", ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") return parser.parse_args() args = parse_args() @@ -86,6 +87,11 @@ if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) +endianess = gguf.GGUFEndian.LITTLE +if args.bigendian: + endianess = gguf.GGUFEndian.BIG +endianess_str = "Big Endian" if args.bigendian else "Little Endian" +print(f"gguf: Conversion Endianess {endianess}") # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -104,7 +110,7 @@ print("gguf: loading model "+dir_model.name) with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM": +if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": print("Model architecture not supported: " + hparams["architectures"][0]) sys.exit() @@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM": num_parts = count_model_parts(dir_model) print(f"num_parts:{num_parts}\n") ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) print("gguf: get model metadata") @@ -157,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) if 
"rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: if "type" in hparams["rope_scaling"]: if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) # TOKENIZATION @@ -224,7 +231,7 @@ gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model) +special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py index 7bfc95ec1..6e866d943 100755 --- a/convert-bloom-hf-to-gguf.py +++ b/convert-bloom-hf-to-gguf.py @@ -118,18 +118,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size +added_vocab = tokenizer.get_added_vocab() reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 9252e1c46..8e8f3c3f8 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name) with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) -if hparams["architectures"][0] != "FalconForCausalLM": +if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"): print("Model architecture not supported: " + hparams["architectures"][0]) sys.exit(1) @@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) print("gguf: get model metadata") -block_count = hparams["num_hidden_layers"] +block_count = hparams.get("num_hidden_layers") +if block_count is None: + block_count = hparams["n_layer"] # old name + +n_head = hparams.get("num_attention_heads") +if n_head is None: + n_head = hparams["n_head"] # old name + +n_head_kv = hparams.get("num_kv_heads") +if n_head_kv is None: + n_head_kv = hparams.get("n_head_kv", 1) # old name gguf_writer.add_name("Falcon") gguf_writer.add_context_length(2048) # not in config.json @@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform gguf_writer.add_embedding_length(hparams["hidden_size"]) gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -if 
"num_kv_heads" in hparams: - gguf_writer.add_head_count_kv(hparams["num_kv_heads"]) -else: - gguf_writer.add_head_count_kv(1) +gguf_writer.add_head_count(n_head) +gguf_writer.add_head_count_kv(n_head_kv) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) @@ -145,17 +152,13 @@ gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS tensor_map = gguf.get_tensor_name_map(ARCH,block_count) -# params for qkv transform -n_head = hparams["num_attention_heads"] -n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1 - head_dim = hparams["hidden_size"] // n_head # tensor info diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index d4e85f518..02d1fdf16 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -123,18 +123,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size +added_vocab = tokenizer.get_added_vocab() reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index b5d3e0b3c..871add64d 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -388,7 +388,9 @@ def handle_metadata(cfg, hp): cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype ) # FIXME: Respect cfg.vocab_dir? 
- svocab = gguf.SpecialVocab(cfg.model_metadata_dir) + svocab = gguf.SpecialVocab(cfg.model_metadata_dir, + load_merges = cfg.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) convert.check_vocab_size(params, vocab) return (params, vocab, svocab) diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py index 73a4932f7..70d154b3f 100755 --- a/convert-mpt-hf-to-gguf.py +++ b/convert-mpt-hf-to-gguf.py @@ -98,6 +98,8 @@ gguf_writer.add_embedding_length(hparams["d_model"]) gguf_writer.add_block_count(block_count) gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) gguf_writer.add_head_count(hparams["n_heads"]) +if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): + gguf_writer.add_head_count_kv(kv_n_heads) gguf_writer.add_layer_norm_eps(1e-05) if hparams["attn_config"]["clip_qkv"] is not None: gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) @@ -126,18 +128,27 @@ vocab_size = hparams["vocab_size"] # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py tokenizer = AutoTokenizer.from_pretrained(dir_model) +added_vocab = tokenizer.get_added_vocab() reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py index bfeabc082..f0cfe84d8 100755 --- a/convert-refact-hf-to-gguf.py +++ b/convert-refact-hf-to-gguf.py @@ -139,18 +139,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size +added_vocab = tokenizer.get_added_vocab() reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 
90fa0c32f..a9bfed85e 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -111,18 +111,26 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size +added_vocab = tokenizer.get_added_vocab() reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} for i in range(vocab_size): - tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) special_vocab.add_to_gguf(gguf_writer) # TENSORS diff --git a/convert.py b/convert.py index e9b08d344..9110f1580 100755 --- a/convert.py +++ b/convert.py @@ -151,8 +151,11 @@ class Params: n_head_kv: int f_norm_eps: float + rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None ftype: GGMLFileType | None = None @@ -198,20 +201,20 @@ class Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - + rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": - f_rope_scale = config["rope_scaling"].get("factor") - else: - f_rope_scale = None + + if rope_scaling is not None and (typ := rope_scaling.get("type")): + rope_factor = rope_scaling.get("factor") + f_rope_scale = rope_factor + if typ == "linear": + rope_scaling_type = gguf.RopeScalingType.LINEAR + elif typ == "yarn": + rope_scaling_type = gguf.RopeScalingType.YARN + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] + else: + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] @@ -222,16 +225,19 @@ class Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, - n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, - f_rope_scale = f_rope_scale, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = 
config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json @@ -240,17 +246,8 @@ class Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] if "vocab_size" in config else -1 - n_embd = config["dim"] - n_layer = config["n_layers"] - n_ff = -1 - n_head = config["n_heads"] - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head - f_norm_eps = config["norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - # hack to determine LLaMA v1 vs v2 vs CodeLlama - if f_rope_freq_base == 1000000: + if config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: @@ -260,22 +257,16 @@ class Params: # LLaMA v1 n_ctx = 2048 - if n_vocab == -1: - n_vocab = model["tok_embeddings.weight"].shape[0] - - if n_ff == -1: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_layer = n_layer, + n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), + n_embd = config["dim"], + n_layer = config["n_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta"), ) @staticmethod @@ -366,16 +357,19 @@ class SentencePieceVocab: added_tokens = {} vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} + expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) + actual_new_ids = sorted(new_tokens.keys()) + + if expected_new_ids != actual_new_ids: + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") + + # Token pieces that were added to the base vocabulary. 
+ self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -803,8 +797,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -828,8 +822,16 @@ class OutputFile: if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) - if params.f_rope_scale is not None: - self.gguf.add_rope_scale_linear(params.f_rope_scale) + if params.rope_scaling_type: + assert params.f_rope_scale is not None + self.gguf.add_rope_scaling_type(params.rope_scaling_type) + self.gguf.add_rope_scaling_factor(params.f_rope_scale) + + if params.n_orig_ctx is not None: + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) + + if params.rope_finetuned is not None: + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) if params.ftype is not None: self.gguf.add_file_type(params.ftype) @@ -875,10 +877,10 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -903,10 +905,10 @@ class OutputFile: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -1123,8 +1125,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) - args = parser.parse_args(args_in) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + args = parser.parse_args(args_in) if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) @@ -1138,6 +1141,9 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE + if args.bigendian: + endianess = gguf.GGUFEndian.BIG params = Params.load(model_plus) if params.n_ctx == 
-1: @@ -1159,10 +1165,13 @@ def main(args_in: list[str] | None = None) -> None: vocab: Vocab if args.vocab_only: - assert args.outfile, "need --outfile if using --vocab-only" + if not args.outfile: + raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1174,7 +1183,9 @@ def main(args_in: list[str] | None = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) model = model_plus.model model = convert_model_names(model, params) @@ -1185,7 +1196,7 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) print(f"Wrote {outfile}") diff --git a/docs/BLIS.md b/docs/BLIS.md index f3d2312b4..0bcd6eeef 100644 --- a/docs/BLIS.md +++ b/docs/BLIS.md @@ -49,7 +49,7 @@ According to the BLIS documentation, we could set the following environment variables to modify the behavior of openmp: ```bash -export GOMP_GPU_AFFINITY="0-19" +export GOMP_CPU_AFFINITY="0-19" export BLIS_NUM_THREADS=14 ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ab8459370..75b8df676 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,25 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(main) - add_subdirectory(quantize) - add_subdirectory(quantize-stats) - add_subdirectory(perplexity) - add_subdirectory(embedding) - add_subdirectory(save-load-state) - add_subdirectory(benchmark) add_subdirectory(baby-llama) - add_subdirectory(train-text-from-scratch) - add_subdirectory(finetune) - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(simple) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(speculative) - add_subdirectory(parallel) - add_subdirectory(embd-input) - add_subdirectory(llama-bench) add_subdirectory(beam-search) + add_subdirectory(benchmark) + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(embedding) + add_subdirectory(finetune) + add_subdirectory(infill) + add_subdirectory(llama-bench) + add_subdirectory(llava) + add_subdirectory(main) + add_subdirectory(parallel) + add_subdirectory(perplexity) + add_subdirectory(quantize) + add_subdirectory(quantize-stats) + add_subdirectory(save-load-state) + add_subdirectory(simple) + add_subdirectory(speculative) + add_subdirectory(train-text-from-scratch) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 3e1e0716d..533c55c17 
100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { return 1; } - llama_batch batch = llama_batch_init(n_kv_max, 0); + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { @@ -123,11 +123,12 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, - batch.token + i, + batch.token + i, nullptr, - batch.pos + i, - batch.seq_id + i, - batch.logits + i, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; @@ -143,13 +144,8 @@ int main(int argc, char ** argv) { // warm up { - batch.n_tokens = 16; - - for (int i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = 0; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (int i = 0; i < 16; ++i) { + llama_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -158,6 +154,10 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n"); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq); + LOG_TEE("\n"); + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); @@ -174,19 +174,18 @@ int main(int argc, char ** argv) { continue; } - batch.n_tokens = is_pp_shared ? pp : pl*pp; + llama_batch_clear(batch); - for (int i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = 0; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + const int n_tokens = is_pp_shared ? pp : pl*pp; + + for (int i = 0; i < n_tokens; ++i) { + llama_batch_add(batch, 0, i, { 0 }, false); } batch.logits[batch.n_tokens - 1] = true; const auto t_pp_start = ggml_time_us(); - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_TEE("%s: llama_decode() failed\n", __func__); @@ -204,13 +203,10 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - batch.n_tokens = pl; + llama_batch_clear(batch); for (int j = 0; j < pl; ++j) { - batch.token[j] = 0; - batch.pos[j] = pp + i; - batch.seq_id[j] = j; - batch.logits[j] = true; + llama_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 938f30512..772730382 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -69,7 +69,7 @@ for id: llama_token in tokens { print("\n") -var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0) +var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) defer { llama_batch_free(batch) } @@ -80,7 +80,12 @@ batch.n_tokens = Int32(tokens.count) for (i, token) in tokens.enumerated() { batch.token[i] = token batch.pos[i] = Int32(i) - batch.seq_id[i] = 0 + batch.n_seq_id[i] = 1 + // batch.seq_id[i][0] = 0 + // TODO: is this the proper way to do this? 
+ if let seq_id = batch.seq_id[i] { + seq_id[0] = 0 + } batch.logits[i] = 0 } @@ -169,7 +174,10 @@ while n_cur <= n_len { // push this new token for next evaluation batch.token[Int(batch.n_tokens)] = new_token_id batch.pos[Int(batch.n_tokens)] = n_cur - batch.seq_id[Int(batch.n_tokens)] = Int32(i) + batch.n_seq_id[Int(batch.n_tokens)] = 1 + if let seq_id = batch.seq_id[Int(batch.n_tokens)] { + seq_id[0] = Int32(i) + } batch.logits[Int(batch.n_tokens)] = 1 i_batch[i] = batch.n_tokens @@ -209,7 +217,7 @@ llama_print_timings(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let n_tokens = text.count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos) + let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index a88e022d6..22a4265df 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -11,12 +11,19 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); return 1 ; } + // number of parallel batches int n_parallel = 1; + // total length of the sequences including the prompt + int n_len = 32; + + // number of layers to offload to the GPU + int n_gpu_layers = 0; + if (argc >= 2) { params.model = argv[1]; } @@ -29,13 +36,18 @@ int main(int argc, char ** argv) { n_parallel = std::atoi(argv[3]); } + if (argc >= 5) { + n_len = std::atoi(argv[4]); + } + + if (argc >= 6) { + n_gpu_layers = std::atoi(argv[5]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } - // total length of the sequences including the prompt - const int n_len = 32; - // init LLM llama_backend_init(params.numa); @@ -44,7 +56,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = n_gpu_layers; llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -97,20 +109,15 @@ int main(int argc, char ** argv) { fflush(stderr); - // create a llama_batch with size 512 + // create a llama_batch // we use this object to submit token data for decoding - - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0); + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); // evaluate the initial prompt - batch.n_tokens = tokens_list.size(); - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = tokens_list[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (size_t i = 0; i < tokens_list.size(); ++i) { + llama_batch_add(batch, tokens_list[i], i, { 0 }, false); } + GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; @@ -146,7 +153,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_len) { // prepare the next batch - batch.n_tokens = 0; + llama_batch_clear(batch); // sample the next token for each parallel sequence / 
stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -180,7 +187,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { @@ -198,15 +205,10 @@ int main(int argc, char ** argv) { streams[i] += llama_token_to_piece(ctx, new_token_id); - // push this new token for next evaluation - batch.token [batch.n_tokens] = new_token_id; - batch.pos [batch.n_tokens] = n_cur; - batch.seq_id[batch.n_tokens] = i; - batch.logits[batch.n_tokens] = true; - i_batch[i] = batch.n_tokens; - batch.n_tokens += 1; + // push this new token for next evaluation + llama_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index f078ab8a8..679b382e1 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -47,7 +47,7 @@ struct beam_search_callback_data { // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // For example, eob can be flagged due to maximum token length, stop words, etc. static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); + return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx)); } // Function matching type llama_beam_search_callback_fn_t. diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 14916d831..2bb47bab5 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -1,9 +1,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index f1c382aa9..76e3f57cc 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "ggml.h" diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c291f0adf..cae3bf3c3 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) { if (file.size < 4) { return false; } - uint32_t magic = file.read_u32(); + std::string magic = file.read_string(4); return magic == GGUF_MAGIC; } diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore deleted file mode 100644 index 87ef68771..000000000 --- a/examples/embd-input/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -PandaGPT -MiniGPT-4 -*.pth - diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt deleted file mode 100644 
index 5bbb1ea02..000000000 --- a/examples/embd-input/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -set(TARGET embdinput) -add_library(${TARGET} embd-input-lib.cpp embd-input.h) -install(TARGETS ${TARGET} LIBRARY) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() - -set(TARGET embd-input-test) -add_executable(${TARGET} embd-input-test.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md deleted file mode 100644 index 5c4c75ea7..000000000 --- a/examples/embd-input/README.md +++ /dev/null @@ -1,63 +0,0 @@ -### Examples for input embedding directly - -## Requirement -build `libembdinput.so` -run the following comman in main dir (../../). -``` -make -``` - -## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) - -1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). -2. Convert it to ggml format. -3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). - -``` -import torch - -bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd-input/llava_projection.pth" - -dic = torch.load(bin_path) -used_key = ["model.mm_projector.weight","model.mm_projector.bias"] -torch.save({k: dic[k] for k in used_key}, pth_path) -``` -4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`. - - -## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py) - -1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. -The `adapter_config.json` is -``` -{ - "peft_type": "LORA", - "fan_in_fan_out": false, - "bias": null, - "modules_to_save": null, - "r": 32, - "lora_alpha": 32, - "lora_dropout": 0.1, - "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] -} -``` -2. Papare the `vicuna` v0 model. -3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. -4. Clone the PandaGPT source. -``` -git clone https://github.com/yxuansu/PandaGPT -``` -5. Install the requirement of PandaGPT. -6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. - -## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py) - -1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`. -2. Clone the MiniGPT-4 source. -``` -git clone https://github.com/Vision-CAIR/MiniGPT-4/ -``` -3. Install the requirement of PandaGPT. -4. Papare the `vicuna` v0 model. -5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`. 
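The example changes in this patch share one theme: they stop filling `llama_batch` fields by hand and stop querying special tokens through the context, switching to the sequence-aware batch helpers and model-based token queries instead (see the `batched-bench.cpp` and `batched.cpp` hunks above; the `embd-input` code removed below still used the old pattern). A minimal sketch of the new pattern, for orientation only — it assumes the `llama_batch_add`/`llama_batch_clear` helpers from `common.h` that the updated examples use, and the model/context/prompt setup is a placeholder, not part of this patch:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <vector>

// Decode a prompt on a single sequence (id 0) using the updated batch API.
static void decode_prompt(llama_context * ctx, const std::vector<llama_token> & prompt) {
    const llama_model * model = llama_get_model(ctx);

    // third argument is the new n_seq_max parameter of llama_batch_init
    llama_batch batch = llama_batch_init((int32_t) prompt.size(), 0, 1);

    llama_batch_clear(batch);
    for (size_t i = 0; i < prompt.size(); ++i) {
        // token, position, sequence ids, whether to compute logits
        llama_batch_add(batch, prompt[i], (llama_pos) i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true; // logits only for the last prompt token

    llama_kv_cache_clear(ctx); // replaces llama_kv_cache_tokens_rm(ctx, -1, -1)

    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode() failed\n");
    }

    // special-token queries now take the model, not the context
    const llama_token eos = llama_token_eos(model);
    (void) eos;

    llama_batch_free(batch);
}
```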
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp deleted file mode 100644 index 87a5a1c26..000000000 --- a/examples/embd-input/embd-input-lib.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "embd-input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static llama_context ** g_ctx; - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return nullptr; - } - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = uint32_t(time(NULL)); - } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); - } - struct MyModel * ret = new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void * model, float * input, int N){ - MyModel * mymodel = (MyModel*)model; - llama_context * ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(llama_get_model(ctx)); - int n_past = mymodel->n_past; - int n_batch = N; // params.n_batch; - - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_tokens(void * model, std::vector tokens) { - MyModel * mymodel = (MyModel* )model; - llama_context * ctx; - ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_past = mymodel->n_past; - for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(mymodel, tokens); -} - -bool eval_string(struct MyModel * mymodel,const char* str){ - llama_context * ctx = mymodel->ctx; - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - llama_sampling_params & sparams = params.sampling_params; - // int n_ctx = llama_n_ctx(ctx); - - // out of user input, sample next token - const float temp = sparams.temp; - const int32_t top_k = 
sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k; - const float top_p = sparams.top_p; - const float tfs_z = sparams.tfs_z; - const float typical_p = sparams.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = sparams.mirostat; - const float mirostat_tau = sparams.mirostat_tau; - const float mirostat_eta = sparams.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); - - // Apply params.logit_bias map - for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - } - - return id; -} - -const char * sampling(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - int id = sampling_id(mymodel); - static std::string ret; - if (id == llama_token_eos(ctx)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx, id); - } - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp deleted file mode 100644 index dc4a0e488..000000000 --- a/examples/embd-input/embd-input-test.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "embd-input.h" -#include -#include -#include - -int main(int argc, char** argv) { - - auto mymodel = create_mymodel(argc, argv); - int N = 10; - int max_tgt_len = 500; - int n_embd = 
llama_n_embd(llama_get_model(mymodel->ctx)); - - // add random float embd to test evaluation - float * data = new float[N*n_embd]; - std::default_random_engine e; - std::uniform_real_distribution u(0,1); - for (int i=0;iparams.prompt.c_str()); - const char* tmp; - for (int i=0; i")==0) break; - printf("%s", tmp); - fflush(stdout); - } - printf("\n"); - free_mymodel(mymodel); - return 0; -} diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h deleted file mode 100644 index eff5e3b84..000000000 --- a/examples/embd-input/embd-input.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _EMBD_INPUT_H_ -#define _EMBD_INPUT_H_ 1 - -#include "common.h" -#include "llama.h" - -extern "C" { - -typedef struct MyModel { - llama_context* ctx; - gpt_params params; - int n_past = 0; -} MyModel; - -struct MyModel* create_mymodel(int argc, char ** argv); - -bool eval_float(void* model, float* input, int N); -bool eval_tokens(void* model, std::vector tokens); -bool eval_id(struct MyModel* mymodel, int id); -bool eval_string(struct MyModel* mymodel, const char* str); -const char * sampling(struct MyModel* mymodel); -llama_token sampling_id(struct MyModel* mymodel); -void free_mymodel(struct MyModel* mymodel); - -} - -#endif diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py deleted file mode 100755 index f146acdc1..000000000 --- a/examples/embd-input/embd_input.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import ctypes -from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int -import numpy as np -import os - -libc = cdll.LoadLibrary("./libembdinput.so") -libc.sampling.restype=c_char_p -libc.create_mymodel.restype=c_void_p -libc.eval_string.argtypes=[c_void_p, c_char_p] -libc.sampling.argtypes=[c_void_p] -libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] - - -class MyModel: - def __init__(self, args): - argc = len(args) - c_str = [c_char_p(i.encode()) for i in args] - args_c = (c_char_p * argc)(*c_str) - self.model = c_void_p(libc.create_mymodel(argc, args_c)) - self.max_tgt_len = 512 - self.print_string_eval = True - - def __del__(self): - libc.free_mymodel(self.model) - - def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) - - def eval_string(self, x): - libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) - if self.print_string_eval: - print(x) - - def eval_token(self, x): - libc.eval_id(self.model, x) - - def sampling(self): - s = libc.sampling(self.model) - return s - - def stream_generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(self.max_tgt_len): - tmp = self.sampling() - ret += tmp - yield tmp - if ret.endswith(end): - break - - def generate_with_print(self, end=""): - ret = b"" - for i in self.stream_generate(end=end): - ret += i - print(i.decode(errors="replace"), end="", flush=True) - print("") - return ret.decode(errors="replace") - - - def generate(self, end=""): - text = b"".join(self.stream_generate(end=end)) - return text.decode(errors="replace") - -if __name__ == "__main__": - model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - model.eval_string("""user: what is the color of the flag of UN?""") - x = np.random.random((5120,10))# , dtype=np.float32) - model.eval_float(x) - model.eval_string("""assistant:""") - for i in model.generate(): - print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py 
deleted file mode 100755 index 06fad55f4..000000000 --- a/examples/embd-input/llava.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from transformers import CLIPVisionModel, CLIPImageProcessor -from PIL import Image - -# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' -vision_tower = "openai/clip-vit-large-patch14" -select_hidden_state_layer = -2 -# (vision_config.image_size // vision_config.patch_size) ** 2 -image_token_len = (224//14)**2 - -class Llava: - def __init__(self, args): - self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) - self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) - self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", *args]) - - def load_projection(self, path): - state = torch.load(path) - self.mm_projector.load_state_dict({ - "weight": state["model.mm_projector.weight"], - "bias": state["model.mm_projector.bias"]}) - - def chat(self, question): - self.model.eval_string("user: ") - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] - image_feature = select_hidden_state[:, 1:] - embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("user: ") - self.model.eval_token(32003-2) # im_start - self.model.eval_float(embd_image.T) - for i in range(image_token_len-embd_image.shape[0]): - self.model.eval_token(32003-3) # im_patch - self.model.eval_token(32003-1) # im_end - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - -if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 - a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. 
- a.load_projection(os.path.join( - os.path.dirname(__file__) , - "llava_projection.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - respose - a.chat("what is the color of it?") - - - diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py deleted file mode 100755 index 7b13e4a5c..000000000 --- a/examples/embd-input/minigpt4.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from PIL import Image - -minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") -sys.path.insert(0, minigpt4_path) -from minigpt4.models.blip2 import Blip2Base -from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor - - -class MiniGPT4(Blip2Base): - """ - MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 - """ - def __init__(self, - args, - vit_model="eva_clip_g", - q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", - img_size=224, - drop_path_rate=0, - use_grad_checkpoint=False, - vit_precision="fp32", - freeze_vit=True, - freeze_qformer=True, - num_query_token=32, - llama_model="", - prompt_path="", - prompt_template="", - max_txt_len=32, - end_sym='\n', - low_resource=False, # use 8 bit and put vit in cpu - device_8bit=0 - ): - super().__init__() - self.img_size = img_size - self.low_resource = low_resource - self.preprocessor = Blip2ImageEvalProcessor(img_size) - - print('Loading VIT') - self.visual_encoder, self.ln_vision = self.init_vision_encoder( - vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision - ) - print('Loading VIT Done') - print('Loading Q-Former') - self.Qformer, self.query_tokens = self.init_Qformer( - num_query_token, self.visual_encoder.num_features - ) - self.Qformer.cls = None - self.Qformer.bert.embeddings.word_embeddings = None - self.Qformer.bert.embeddings.position_embeddings = None - for layer in self.Qformer.bert.encoder.layer: - layer.output = None - layer.intermediate = None - self.load_from_pretrained(url_or_filename=q_former_model) - print('Loading Q-Former Done') - self.llama_proj = nn.Linear( - self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size - ) - self.max_txt_len = max_txt_len - self.end_sym = end_sym - self.model = MyModel(["main", *args]) - # system prompt - self.model.eval_string("Give the following image: ImageContent. " - "You will be able to see the image once I provide it to you. Please answer my questions." 
- "###") - - def encode_img(self, image): - image = self.preprocessor(image) - image = image.unsqueeze(0) - device = image.device - if self.low_resource: - self.vit_to_cpu() - image = image.to("cpu") - - with self.maybe_autocast(): - image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) - - query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - query_output = self.Qformer.bert( - query_embeds=query_tokens, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=True, - ) - - inputs_llama = self.llama_proj(query_output.last_hidden_state) - # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) - return inputs_llama - - def load_projection(self, path): - state = torch.load(path)["model"] - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def chat(self, question): - self.model.eval_string("Human: ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.encode_img(image) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("Human: ") - self.model.eval_float(embd_image.T) - self.model.eval_string(" ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - -if __name__=="__main__": - a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) - a.load_projection(os.path.join( - os.path.dirname(__file__) , - "pretrained_minigpt4.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - a.chat("what is the color of it?") diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py deleted file mode 100755 index 891ad7cc9..000000000 --- a/examples/embd-input/panda_gpt.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch - -# use PandaGPT path -panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") -imagebind_ckpt_path = "./models/panda_gpt/" - -sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) -from ImageBind.models import imagebind_model -from ImageBind import data - -ModalityType = imagebind_model.ModalityType -max_tgt_len = 400 - -class PandaGPT: - def __init__(self, args): - self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) - self.visual_encoder.eval() - self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) - self.max_tgt_len = max_tgt_len - self.model = MyModel(["main", *args]) - self.generated_text = "" - self.device = "cpu" - - def load_projection(self, path): - state = torch.load(path, map_location="cpu") - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def eval_inputs(self, inputs): - self.model.eval_string("") - embds = self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" ") - - def chat(self, question): - return self.chat_with_image(None, question) - - def chat_with_image(self, inputs, 
question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - if inputs: - self.eval_inputs(inputs) - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - ret = self.model.generate_with_print(end="###") - self.generated_text += ret - return ret - - def extract_multimoal_feature(self, inputs): - features = [] - for key in ["image", "audio", "video", "thermal"]: - if key + "_paths" in inputs: - embeds = self.encode_data(key, inputs[key+"_paths"]) - features.append(embeds) - return features - - def encode_data(self, data_type, data_paths): - - type_map = { - "image": ModalityType.VISION, - "audio": ModalityType.AUDIO, - "video": ModalityType.VISION, - "thermal": ModalityType.THERMAL, - } - load_map = { - "image": data.load_and_transform_vision_data, - "audio": data.load_and_transform_audio_data, - "video": data.load_and_transform_video_data, - "thermal": data.load_and_transform_thermal_data - } - - load_function = load_map[data_type] - key = type_map[data_type] - - inputs = {key: load_function(data_paths, self.device)} - with torch.no_grad(): - embeddings = self.visual_encoder(inputs) - embeds = embeddings[key] - embeds = self.llama_proj(embeds).cpu().numpy() - return embeds - - -if __name__=="__main__": - a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - a.load_projection("./models/panda_gpt/adapter_model.bin") - a.chat_with_image( - {"image_paths": ["./media/llama1-logo.png"]}, - "what is the text in the picture? 'llama' or 'lambda'?") - a.chat("what is the color of it?") diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 0c752c7bb..8ffc33868 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 14075609e..3295cd240 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 9ae4bc198..649a3b7c1 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora set_param_lora(lora); // measure data size - struct ggml_allocr * alloc = NULL; - alloc = ggml_allocr_new_measure(tensor_alignment); - alloc_lora(alloc, lora); + size_t size = 0; + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + size += GGML_PAD(ggml_nbytes(t), tensor_alignment); + } // allocate data - lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); - ggml_allocr_free(alloc); + struct ggml_allocr * alloc = NULL; + lora->data.resize(size + tensor_alignment); alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); alloc_lora(alloc, lora); ggml_allocr_free(alloc); @@ -641,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int rope_mode = 0; return ggml_rope_custom(ctx, - t, KQ_pos, 
n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale); + t, KQ_pos, n_rot, rope_mode, n_ctx, 0, + rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f + ); }; set_name(tokens_input, "tokens_input"); @@ -651,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type)) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else if (a->type == GGML_TYPE_F32) { return ggml_add(ctx, a, b); @@ -1458,6 +1460,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params->common.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1544,6 +1557,7 @@ int main(int argc, char ** argv) { srand(params.common.seed); struct llama_model_params llama_mparams = llama_model_default_params(); + llama_mparams.n_gpu_layers = params.common.n_gpu_layers; llama_mparams.vocab_only = false; printf("%s: model base = '%s'\n", __func__, params.fn_model_base); @@ -1714,11 +1728,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); // measure required memory for input tensors - alloc = ggml_allocr_new_measure(tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; - ggml_allocr_free(alloc); + size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + + GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + + tensor_alignment; printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // allocate input tensors diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh new file mode 100644 index 000000000..079bfa113 --- /dev/null +++ b/examples/finetune/finetune.sh @@ -0,0 +1,34 @@ +#!/bin/bash +cd `dirname $0` +cd ../.. + +EXE="./finetune" + +if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi +if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi + +# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. +MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing. 
+ +while getopts "dg" opt; do + case $opt in + d) + DEBUGGER="gdb --args" + ;; + g) + EXE="./build/bin/Release/finetune" + GPUARG="--gpu-layers 25" + ;; + esac +done + +$DEBUGGER $EXE \ + --model-base $MODEL \ + $GPUARG \ + --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ + --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ + --lora-out lora-ol3b-shakespeare-ITERATION.bin \ + --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ + --save-every 10 \ + --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ + --use-checkpointing diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp deleted file mode 100644 index 9d433f4b1..000000000 --- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp +++ /dev/null @@ -1,1133 +0,0 @@ -#ifndef CMPNCT_GPT2BPE -#define CMPNCT_GPT2BPE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// Unicode GPT2 Byte Pair Encoding Tokenizer -// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] -// Removed loading of merges from HF json and parts made for a specific vocab - - -//----------------- -// Unicode library (from cmpnct_unicode.cpp) -//----------------- - -// Minimal library for high performance handling and categorization of UTF8 strings and characters -// Using std::string - -enum CNCTCharType { - DIGIT, // a numerical char in any language - LETTER, // a letter in any language - WHITESPACE, // any form of whitespace - ACCENT_MARK, // letter modifiers like ´ in é - PUNCTUATION, // punctuation including brackets - SYMBOL, // math, currency, other symbols - CONTROL, // control characters - MIXED, // a mix of the above - UNIDENTIFIED // something more exotic like emoji or separators -}; - -struct CNCTUnicode; - -struct CNCTString { - std::string str; - size_t utf8_chars; - - CNCTCharType char_type=UNIDENTIFIED; - bool is_sequential=false; - - size_t seq_offset_bytes=0; - size_t seq_offset_utf8_chars=0; - - bool operator==(const std::string &other) const; - bool operator==(const char other) const; - bool operator==(const CNCTString &other) const; - CNCTString &operator+=(const std::string &other); - CNCTString &operator+=(const char other); - friend CNCTString operator+(CNCTString lhs, const std::string &rhs); - friend CNCTString operator+(CNCTString lhs, const char rhs); - CNCTString& operator+=(const CNCTString& other); - friend CNCTString operator+(CNCTString lhs, const CNCTString& rhs); -}; - -struct CNCTUnicode { - static bool check_code_range(int c, const std::vector>& ranges); - static CNCTCharType get_code_type(int c); - static CNCTCharType get_code_type(const std::string &utf8_char); - static int utf8_len(const char c); - static int strlen_utf8(std::string src); - static std::vector split_utf8(const std::string &src); - static std::vector split_utf8_enhanced(const std::string &src); - static CNCTCharType string_identify(const std::string& str); - static bool string_test(const std::string& str, CNCTCharType chartype); -}; - -static const std::vector> digit_ranges = { -{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, -{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, -{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 
0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, -{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, -{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, -{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, -{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, -{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9}, -}; - -static const std::vector> letter_ranges = { -{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, -{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, -{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, -{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, -{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, -{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, -{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, -{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, -{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, -{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, -{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, -{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, -{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, -{0x1000, 0x102A}, {0x103F, 0x103F}, 
{0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, -{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, -{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, -{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, -{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, -{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, -{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, -{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, -{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, -{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, -{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, -{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, -{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, -{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, -{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, -{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, -{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, -{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, -{0xF900, 
0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, -{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, -{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, -{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, -{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, -{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, -{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, -{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, -{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, -{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, -{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, -{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, -{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, -{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, -{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, -{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, -{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, -{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 
0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, -{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, -{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, -{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, -{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, -{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, -{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, -{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, -{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, -{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, -}; - -static const std::vector> whitespace_ranges = { -{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, -}; - -static const std::vector> accent_mark_ranges = { -{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, -{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B}, -{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, -{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, -{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, -{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, -{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, -{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 
0xEB1}, {0xEB4, 0xEBC}, -{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, -{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, -{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, -{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, -{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, -{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, -{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, -{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, -{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, -{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, -{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, -{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E}, -{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, -{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, -{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, -{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, -{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, -{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 
0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, -{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, -}; - -static const std::vector> punctuation_ranges = { -{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, -{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, -{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, -{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, -{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, -{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, -{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, -{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, -{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, -{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, -{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20}, -{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, -{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, -{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, -{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, -{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, -{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, -}; 
- -static const std::vector> symbol_ranges = { -{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, -{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, -{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, -{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, -{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, -{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, -{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, -{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4}, -{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, -{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, -{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, -{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69}, -{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, -{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, -{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, -{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, -{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, -{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, 
{0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD}, -{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, -{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, -{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, -}; - -static const std::vector> control_ranges = { -{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, -{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, -{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, -{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, -{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, -{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, -{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, -{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, -{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, -{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, -{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, -{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, -{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, -{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, -{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, 
{0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, -{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, -{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, -{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, -{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, -{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, -{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, -{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, -{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, -{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, -{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, -{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, -{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, -{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, -{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, -{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, -{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, -{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, -{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, 
{0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, -{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, -{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, -{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, -{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, -{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, -{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, -{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, -{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, -{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, -{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, -{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, -{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, -{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, -{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, -{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, -{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, -{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, -{0x1B168, 0x1B16F}, {0x1B2FC, 
0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, -{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, -{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, -{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, -{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, -{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, -{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50}, -{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, -{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, -{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, -{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, -{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F}, -{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, -{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, -}; - -//String -bool CNCTString::operator==(const std::string& other) const { - return str.compare(other) == 0; -} -bool CNCTString::operator==(const char other) const { - return str.compare(std::string(1, other)) == 0; -} -bool CNCTString::operator==(const CNCTString& other) const { - return str.compare(other.str) == 0; -} -// + operators -CNCTString& CNCTString::operator+=(const std::string& other) { - str += other; - int new_len = CNCTUnicode::strlen_utf8(other); - utf8_chars += new_len; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.size(); - seq_offset_utf8_chars += new_len; - return *this; -} - -CNCTString& CNCTString::operator+=(const char other) { - std::string str = std::string(1, other); - *this += str; - return *this; -} - -CNCTString& CNCTString::operator+=(const CNCTString& other) { - str += 
other.str; - utf8_chars += other.utf8_chars; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.str.size(); - seq_offset_utf8_chars += other.utf8_chars; - return *this; -} - -struct CRCompare { - bool operator()(const std::pair& p, int i) { - return p.second < i; - } - bool operator()(int i, const std::pair& p) { - return i < p.first; - } -}; - -// binary search for code range -bool CNCTUnicode::check_code_range(int c, const std::vector> &ranges) { - auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare()); - if (it != ranges.begin()) { - --it; - } - return c >= it->first && c <= it->second; -} - -// these are binary searches, it takes only a few operations -CNCTCharType CNCTUnicode::get_code_type(int c) { - if (check_code_range(c, letter_ranges)) { - return LETTER; - } - if (check_code_range(c, digit_ranges)) { - return DIGIT; - } - if (check_code_range(c, whitespace_ranges)) { - return WHITESPACE; - } - if (check_code_range(c, punctuation_ranges)) { - return PUNCTUATION; - } - if (check_code_range(c, symbol_ranges)) { - return SYMBOL; - } - if (check_code_range(c, accent_mark_ranges)) { - return ACCENT_MARK; - } - if (check_code_range(c, control_ranges)) { - return CONTROL; - } - return UNIDENTIFIED; -} - -static int utf8_to_unicode(const std::string& utf8_char) { - int c = 0; - int len = (int)utf8_char.size(); - if (len == 1) { - c = utf8_char[0]; - } else if (len == 2) { - c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F); - } else if (len == 3) { - c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F); - } else if (len == 4) { - c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F); - } - return c; -} - -CNCTCharType CNCTUnicode::get_code_type(const std::string &utf8_char) { - return get_code_type(utf8_to_unicode(utf8_char)); -} - -int CNCTUnicode::utf8_len(const char c) -{ - if ((c & 0x80) == 0) { - return 1; // ASCII character - } - if ((c & 0xE0) == 0xC0) { - return 2; // 2-byte character - } - if ((c & 0xF0) == 0xE0) { - return 3; // 3-byte character - } - if ((c & 0xF0) == 0xF0) { - return 4; // 4-byte character - } - return 1; // not valid utf8 - // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - // return lookup[static_cast(c) >> 4]; -} - -int CNCTUnicode::strlen_utf8(const std::string src) { - int len = 0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - if (char_len > 1) { - it += char_len - 1; - } - len += 1; - } - return len; -} - -// split a string into unicode strings -std::vector CNCTUnicode::split_utf8(const std::string &src) { - std::vector result; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - result.push_back(str); - if (char_len > 1) { - it += char_len - 1; - } - } - return result; -} - -// split a string into unicode strings (CNCTString) with sequence information -std::vector CNCTUnicode::split_utf8_enhanced(const std::string &src) { - std::vector result; - int seq_offset_bytes=0; - int seq_offset_utf8_chars=0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - CNCTString cnct_str; - cnct_str.seq_offset_bytes = seq_offset_bytes; - cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars; - cnct_str.str = str; - 
cnct_str.utf8_chars = 1; - cnct_str.char_type = get_code_type(str); - #if 0 - switch (cnct_str.char_type) - { - case DIGIT: - printf("%s = DIGIT\n", str.c_str()); - break; - case LETTER: - printf("%s = LETTER\n", str.c_str()); - break; - case WHITESPACE: - printf("%s = WHITESPACE\n", str.c_str()); - break; - case PUNCTUATION: - printf("%s = PUNCTUATION\n", str.c_str()); - break; - case UNIDENTIFIED: - printf("%s = UNIDENTIFIED\n", str.c_str()); - break; - case SYMBOL: - printf("%s = SYMBOL\n", str.c_str()); - break; - case CONTROL: - printf("%s = CONTROL\n", str.c_str()); - break; - } - #endif - - result.push_back(cnct_str); - seq_offset_bytes += char_len; - seq_offset_utf8_chars += 1; - if (char_len > 1) { - it += char_len - 1; - } - - } - return result; -} - -// return the type of the string -CNCTCharType CNCTUnicode::string_identify(const std::string &str) { - CNCTCharType result = UNIDENTIFIED; - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - switch (get_code_type(c)) { - case DIGIT: - if (result == UNIDENTIFIED) { - result = DIGIT; - } else if (result != DIGIT) { - return MIXED; - } - break; - case LETTER: - if (result == UNIDENTIFIED) { - result = LETTER; - } else if (result != LETTER) { - return MIXED; - } - break; - case WHITESPACE: - if (result == UNIDENTIFIED) { - result = WHITESPACE; - } else if (result != WHITESPACE) { - return MIXED; - } - break; - case PUNCTUATION: - if (result == UNIDENTIFIED) { - result = PUNCTUATION; - } else if (result != PUNCTUATION) { - return MIXED; - } - break; - default: - return MIXED; - break; - } - } - return result; -} - -// verify the content of a string -bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype) -{ - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - if (get_code_type(c) != chartype) { - return false; - } - } - return true; -} - -//----------------- -// llama.cpp GPT2 vocab (from libfalcon.cpp) -//----------------- - -std::string replaceAll(std::string str, const std::string& from, const std::string& to) { - size_t start_pos = 0; - while((start_pos = str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); // Handles case where 'to' is a substring of 'from' - } - return str; -} - -struct TrieNode { - std::map map; - int32_t Id = -1; -}; - -struct Trie { - TrieNode *root; - - Trie() : root(new TrieNode()) {} - - ~Trie() { - if(root) - deleteTrie(root); - } - - // Move constructor - Trie(Trie&& other) noexcept : root(other.root) { - other.root = nullptr; - } - - // Move assignment operator - Trie& operator=(Trie&& other) noexcept { - if (this != &other) { - if(root) - deleteTrie(root); - root = other.root; - other.root = nullptr; - } - return *this; - } - - void insert(const std::string &token, int32_t Id) { - TrieNode* current = root; - for(auto ch : token) { - if(current->map.find(ch) == current->map.end()) { - current->map[ch] = new TrieNode(); - } - current = current->map[ch]; - } - current->Id = Id; - } - - void reset() { - deleteTrie(root); - root = new TrieNode(); - } - -private: - void deleteTrie(TrieNode* node) { - for(auto &it: node->map) { - deleteTrie(it.second); - } - delete node; - } - -}; - -struct 
gpt2bpe_vocab { - using id = int32_t; - using token = std::string; - - std::map max_token_length; // max length, for each 2byte prefix - std::map, int> bpe_ranks; - std::vector> bpe_merges; - - id special_bos_id = -1; - id special_eos_id = -1; - id special_unk_id = -1; - id special_sep_id = -1; - id special_pad_id = -1; - - id linefeed_id = -1; - - std::unordered_map token_to_id; - std::unordered_map id_to_token; - - Trie trie; // highspeed access to tokens by prefix tree - - // populate trie from map - void populate_trie_from_map() { - trie.reset(); - for (const auto& pair : token_to_id) { - trie.insert(pair.first, pair.second); - if (pair.first.size() >= 2) { - std::string prefix = pair.first.substr(0, 2); - max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size()); - } - } - } - // populate token ranks map - int populate_bpe_ranks(std::vector> bpe_merges_) { - for (int i = 0; i < (int)bpe_merges_.size(); i++) { - bpe_ranks.emplace(bpe_merges_[i], i); - } - bpe_merges = bpe_merges_; - return bpe_merges_.size(); - } - - // Trim whitespace characters from the beginning and end of the string - void trim(std::string& str) { - // Remove whitespace characters from the beginning of the string - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) { - return !std::isspace(ch); - })); - - // Remove whitespace characters from the end of the string - str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) { - return !std::isspace(ch); - }).base(), str.end()); - } - - // get max token length available for a prefix of 2 bytes (string at least 2 bytes long) - int get_max_token_length(const std::string& string) const { - if (string.size() < 2) { - return -1; - } - std::string prefix = string.substr(0, 2); - if (max_token_length.find(prefix) == max_token_length.end()) { - return 0; - } - return max_token_length.at(prefix); - } - - // function to find if two tokens match in bpe_rank, return rank or -1 - int find_bpe_rank(const std::string& token1, const std::string& token2) const { - std::string left_token = token1; - std::string right_token = token2; - left_token = replaceAll(left_token, " ", "Ġ"); - left_token = replaceAll(left_token, "\n", "Ċ"); - right_token = replaceAll(right_token, " ", "Ġ"); - right_token = replaceAll(right_token, "\n", "Ċ"); - - auto it = bpe_ranks.find(std::make_pair(left_token, right_token)); - if (it == bpe_ranks.end()) { - return -1; - } - return it->second; - } - - std::pair find_longest_match(const std::string& snippet) const { - TrieNode* current = trie.root; - gpt2bpe_vocab::id last_matched_id = -1; - std::string last_matched_token = ""; - std::string current_token = ""; - for (auto ch : snippet) { - if (current->map.find(ch) == current->map.end()) { - break; - } - current = current->map[ch]; - current_token += ch; - if (current->Id != -1) { - last_matched_id = current->Id; - last_matched_token = current_token; - } - } - return {last_matched_id, last_matched_token}; - } - -}; - - -// -// tokenizer - bpe type, gpt2 tokenization compatible -// - -struct ggllm_bpe_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; - -static_assert(std::is_trivially_copyable::value, "ggllm_bpe_symbol is not trivially copyable"); - -struct ggllm_bpe_bigram { - struct comparator { - bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) { - return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); - } - }; - - using queue_storage = std::vector; - using queue = std::priority_queue; - 
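// Note on ordering: std::priority_queue treats the comparator as "less", so
// with the '>' comparison above the queue behaves as a min-heap on rank.
// tokenize() therefore always pops the candidate bigram with the lowest BPE
// merge rank (ties broken towards the leftmost position), i.e. the most
// frequent merge is applied before rarer ones.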
ggllm_bpe_symbol::index left; - ggllm_bpe_symbol::index right; - std::string text; - int rank; - size_t size; -}; - -struct gpt2bpe_tokenizer { - gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; } - - void tokenize(const std::string & text, std::vector & output) { - int final_prev_index = -1; - // auto start = ggml_time_us(); - auto word_collection = bpe_gpt2_preprocess(text); - // auto end = ggml_time_us(); - // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0); - - symbols_final.clear(); - - for (auto & word : word_collection) { - work_queue_ = ggllm_bpe_bigram::queue(); - symbols_.clear(); - - int index = 0; - size_t offset = 0; - - while (offset < word.size()) { - ggllm_bpe_symbol sym; - size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset])); - sym.text = word.c_str() + offset; - sym.n = 1; - sym.n = char_len; - offset += sym.n; - sym.prev = index - 1; - sym.next = offset == word.size() ? -1 : index + 1; - index++; - symbols_.emplace_back(sym); - } - for (size_t i = 1; i < symbols_.size(); ++i) { - add_new_bigram(i - 1, i); - } - - // build token(s) - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); - - auto & left_symbol = symbols_[bigram.left]; - auto & right_symbol = symbols_[bigram.right]; - - if (left_symbol.n == 0 || right_symbol.n == 0) { - continue; - } - std::string left_token = std::string(left_symbol.text, left_symbol.n); - std::string right_token = std::string(right_symbol.text, right_symbol.n); - if (left_token + right_token != bigram.text) { - continue; // Skip this bigram if it's outdated - } - - // merge the right sym into the left one - left_symbol.n += right_symbol.n; - right_symbol.n = 0; - - // remove the right sym from the chain - left_symbol.next = right_symbol.next; - if (right_symbol.next >= 0) { - symbols_[right_symbol.next].prev = bigram.left; - } - - add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol - add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol - } - - // add the fnished tokens to the final list keeping correct order for next and prev - for (auto & sym : symbols_) { - if (sym.n > 0) { - sym.prev = final_prev_index; - sym.next = -1; - if (final_prev_index != -1) { - symbols_final[final_prev_index].next = symbols_final.size(); - } - symbols_final.emplace_back(sym); - final_prev_index = symbols_final.size() - 1; - } - } - } - - symbols_ = symbols_final; - if (symbols_.size()) - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - if (symbol.n == 0) { - continue; - } - std::string str = std::string(symbol.text, symbol.n); - std::string str_decoded = decode_token(str); - auto token = vocab_.token_to_id.find(str_decoded); - - if (token == vocab_.token_to_id.end()) { - for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) { - std::string byte_str(1, *j); - auto token_multibyte = vocab_.token_to_id.find(byte_str); - if (token_multibyte == vocab_.token_to_id.end()) { - fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); - } - output.push_back((*token_multibyte).second); - } - } else { - output.push_back((*token).second); - } - } - } - -private: - void add_new_bigram(int left, int right) { - if (left == -1 || right == -1) return; - - std::string left_token = std::string(symbols_[left].text, symbols_[left].n); - std::string right_token = std::string(symbols_[right].text, 
symbols_[right].n); - - int rank_found = -1; - rank_found = vocab_.find_bpe_rank(left_token, right_token); - - if (rank_found < 0) { - return; - } - - ggllm_bpe_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.rank = rank_found; - bigram.size = left_token.size() + right_token.size(); - bigram.text = left_token + right_token; - work_queue_.push(bigram); - } - - std::unordered_map bytes_to_unicode() { - static std::unordered_map hex_map = { - { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" }, - { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" }, - { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" }, - { 0x3F, "\x3F" }, { 0x40, "\x40" }, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" }, - { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" }, - { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, "\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" }, - { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" }, - { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" }, - { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" }, - { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" }, - { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" }, - { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" }, - { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" }, - { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" }, - { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" }, - { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" }, - { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, "\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" }, - { 0xDF, "\xC3\x9F" 
}, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" }, - { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" }, - { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" }, - { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" }, - { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" }, - { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" }, - { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" }, - { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" }, - { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" }, - { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" }, - { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 0x94, "\xC4\xB6" }, - { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" }, - { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" } - }; - return hex_map; - } - - std::unordered_map unicode_to_bytes() { - static std::unordered_map hex_map = { - { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A }, - { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 }, - { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E }, - { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 }, - { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 }, - { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C }, - { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 
}, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 }, - { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 }, - { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A }, - { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 }, - { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE }, - { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 }, - { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA }, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE }, - { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 }, - { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE }, - { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 }, - { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE }, - { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 }, - { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE }, - { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 0xF5 }, { "\xC3\xB6", 0xF6 }, - { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE }, - { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 }, - { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E }, - { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 }, - { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E }, - { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 }, - { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { 
"\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C }, - { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 }, - { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C }, - { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD } - }; - return hex_map; - } - - // len must be available - bool inline str_is_equal(const char* str1, const char* str2, size_t len) { - for (size_t i = 0; i < len; ++i) { - if (str1[i] != str2[i]) { - return false; - } - } - return true; - } - - std::vector bpe_gpt2_preprocess(const std::string& text) { - static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode(); - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token=""; - const char *raw_text_p = text.c_str(); - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting=false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - text_utf = CNCTUnicode::split_utf8_enhanced(text); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const CNCTString &utf_char = text_utf[i]; - bool split_condition = false; - const char *text_pos = raw_text_p + utf_char.seq_offset_bytes; - int bytes_remain = strlen(text_pos); - // forward backward lookups - const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString(); - const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); - // const CNCTString &utf_char_prev = (i > 0) ? 
text_utf[i-1] : CNCTString(); - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str; - bpe_words.emplace_back(token); - token=""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == '\'' && ( - (utf_char_next == 'r' || utf_char_next_next == 'e') || - (utf_char_next == 'v' || utf_char_next_next == 'e') || - (utf_char_next == 'l' || utf_char_next_next == 'l')) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str + utf_char_next_next.str; - bpe_words.emplace_back(token); // the contraction - token=""; - i+=2; - continue; - } - } - - if (!split_condition && !collecting) { - if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) { - collecting_letter = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) { - collecting_numeric = true; - collecting = true; - } else if ( - ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) || - (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE) { - split_condition = true; - } - } else if (!split_condition && collecting) { - if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) { - split_condition = true; - } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) { - split_condition = true; - } else if (collecting_special && (utf_char.char_type == CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) { - split_condition = true; - } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) { - split_condition = true; - } - } - - if(utf_char_next.str.size() == 0) { - split_condition = true; // final - token += utf_char.str; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char.str; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } else { - token += utf_char.str; - } - } - - for (std::string& word : bpe_words) { - std::string encoded_token=""; - for (char& c : word) { - encoded_token += byte_encoder[c]; - } - bpe_encoded_words.emplace_back(encoded_token); - } - - return bpe_encoded_words; - } - - // decoder (for one token) - std::string decode_token(const std::string& 
token) { - static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes(); - std::string decoded_token=""; - auto unicode_seqeunces = CNCTUnicode::split_utf8(token); - for (auto& unicode_sequence : unicode_seqeunces) { - decoded_token += byte_decoder[unicode_sequence]; - } - - return decoded_token; - } - - const gpt2bpe_vocab & vocab_; - std::vector symbols_; - std::vector symbols_final; - ggllm_bpe_bigram::queue work_queue_; - bool flag_g2ws=false; -}; - -static std::vector gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) { - gpt2bpe_tokenizer tokenizer(vocab, g2ws); - std::vector output; - - if (text.empty()) { - return output; - } - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - - tokenizer.tokenize(text, output); - return output; -} - -#endif // CMPNCT_GPT2BPE diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp deleted file mode 100644 index e9197f6b5..000000000 --- a/examples/gptneox-wip/falcon-main.cpp +++ /dev/null @@ -1,1111 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct falcon_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_head_kv = 1; // Needs to be 1 for 7B model - uint32_t n_ff = 0; - uint32_t n_block = 0; - float norm_eps = 1e-5; -}; -struct falcon_block { - // normalization - struct ggml_tensor* input_layernorm; - struct ggml_tensor* input_layernorm_b; - struct ggml_tensor* attention_norm; // Falcon-40B only - struct ggml_tensor* attention_norm_b; // Falcon-40B only - - // attention - struct ggml_tensor* query_key_value; - struct ggml_tensor* wo; - - // ff - struct ggml_tensor* ffn_up; - struct ggml_tensor* ffn_down; -}; - -struct falcon_model { - falcon_hparams hparams; - - struct ggml_tensor* tok_embeddings; - struct ggml_tensor* output_norm; - struct ggml_tensor* output_norm_b; - struct ggml_tensor* lm_head; - - std::vector blocks; - - // key + value memory - struct ggml_tensor* memory_k; - struct ggml_tensor* memory_v; - - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl 
N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - 
gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - 
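// Worked example of the top-p truncation above, with hypothetical numbers:
// if top_k = 4, top_p = 0.9 and the normalized top-4 probabilities are
// {0.5, 0.3, 0.15, 0.05}, the cumulative sum first reaches 0.95 >= 0.9 at the
// third entry, so the candidate set shrinks to 3 tokens and each remaining
// probability is rescaled by 1/0.95 before the std::discrete_distribution draw.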
-} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - // check model tensor data layout kv - keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { - printf("%s: model tensor data layout not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model tensor data 
layout not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length"); - if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv"); - if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); } - - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - 
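// The index of each merge pair in this list becomes its rank via
// populate_bpe_ranks(); find_bpe_rank() returns that index later, and the
// tokenizer prefers lower ranks when choosing which adjacent symbols to
// merge first.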
bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - - auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - - model.blocks.resize(n_block); - - model.tok_embeddings = ggml_get_tensor(ctx, "token_embd.weight"); - - model.output_norm = ggml_get_tensor(ctx, "output_norm.weight"); - model.output_norm_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lm_head = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.tok_embeddings; - model.tensors["output_norm.weight"] = model.output_norm; - model.tensors["output_norm.bias"] = model.output_norm_b; - model.tensors["output.weight"] = model.lm_head; - - for (int i = 0; i < n_block; ++i) { - - auto& block = model.blocks[i]; - std::string blocknamestart = "blk." 
+ std::to_string(i) + "."; - - block.input_layernorm = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.input_layernorm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - block.attention_norm = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" ); - block.attention_norm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" ); - } - - // query_key_value shape for config.multi_query == True: - block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - - block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - - // map by name - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - // Falcon-40B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm; - model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b; - } else { - // Falcon-7B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - } - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value; - model.tensors[blocknamestart + "attn_output.weight"] = block.wo; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up; - model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool falcon_eval( - const falcon_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - - - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_vocab = hparams.n_vocab; - const size_t head_dim = n_embd / n_head; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - 
- // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; -// gf.n_threads = n_threads; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); -// struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head); - - ggml_type wtype = GGML_TYPE_F32; - const int sizeof_wtype = ggml_type_sizef(wtype); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - struct ggml_tensor * layernorm_output; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - layernorm_output = ggml_norm(ctx0, inpL); - - layernorm_output = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output), - layernorm_output), - ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output)); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - cur = ggml_norm(ctx0, inpL); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].attention_norm, cur), - cur), - ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur)); - } - else { // Falcon 7B - cur = layernorm_output; - } - - // compute QKV - - cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
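// Layout sketch for the fused QKV tensor that the three views below slice
// (following the strides used in this function): each row of `cur` packs
// (n_head + 2 * n_head_kv) head vectors of head_dim values, with the n_head
// query heads first, then n_head_kv key heads, then n_head_kv value heads.
// That gives the shared row stride head_dim * (n_head + 2*n_head_kv) and the
// per-view offsets 0 (Q), head_dim * n_head (K) and
// head_dim * (n_head + n_head_kv) (V), each scaled by sizeof_wtype.
// With hypothetical 7B-style values n_head = 71, n_head_kv = 1 and
// head_dim = 64, a row holds 73 * 64 values and the K view starts
// 71 * 64 elements into it.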
- - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, head_dim, n_head, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - 0); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * n_head * sizeof_wtype); - - struct ggml_tensor * Vcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * (n_head + n_head_kv) * sizeof_wtype); - - // using mode = 2 for neox mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0); - - // store key and value to memory - { - struct ggml_tensor* k = ggml_view_1d( - ctx0, model.memory_k, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - struct ggml_tensor* v = ggml_view_1d( - ctx0, model.memory_v, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * K = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_k) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - - // K * Q - -// K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor* V = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_v) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - -// V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.blocks[il].wo, - cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - struct ggml_tensor* inpFF = layernorm_output; - struct ggml_tensor* attn_out = ggml_cpy( - ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - { - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur); - } - - cur = ggml_add(ctx0, cur, attn_out); - cur = ggml_add(ctx0, 
cur, inpL); - // input for next layer - inpL = cur; - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.output_norm, inpL), - inpL), - ggml_repeat(ctx0, model.output_norm_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); -// ggml_graph_compute (ctx0, &gf); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - falcon_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 
0) { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp deleted file mode 100644 index b76bafaa8..000000000 --- a/examples/gptneox-wip/gptneox-main.cpp +++ /dev/null @@ -1,1083 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct gpt_neox_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_block = 0; - uint32_t n_rot = 0; // rotary_pct * (n_embd / n_head) - bool par_res = true; - float norm_eps = 1e-5; -}; - -struct gpt_neox_block { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct 
ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - - std::vector blocks; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") 
{ - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - 
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - 
keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count"); - if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual"); - if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = 
gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - const int n_block = 
model.hparams.n_block; - - model.blocks.resize(n_block); - - model.wte = ggml_get_tensor(ctx, "token_embd.weight"); - model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight"); - model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lmh_g = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.wte; - model.tensors["output_norm.weight"] = model.ln_f_g; - model.tensors["output_norm.bias"] = model.ln_f_b; - model.tensors["output.weight"] = model.lmh_g; - - for (int i = 0; i < n_block; ++i) { - auto & block = model.blocks[i]; - - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" ); - - block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" ); - - block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" ); - block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias"); - - block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" ); - - block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" ); - - // map by name - model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g; - model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b; - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w; - model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b; - - model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w; - model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b; - - model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g; - model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w; - model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b; - - model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w; - model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// feed-forward network -ggml_tensor * 
gpt_neox_ff( - const gpt_neox_block &block, - ggml_context * ctx0, - ggml_tensor * inp, - const gpt_neox_hparams &hparams) { - - ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur); - return cur; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.norm_eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur)); - } - - // compute QKV - { - - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = 
ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.norm_eps); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - 
ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - gpt_neox_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to 
predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 046f9b1e7..e4e8028da 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 187623f5d..62f5ce3c1 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -2,7 +2,6 @@ #include "console.h" #include "llama.h" -#include "build-info.h" #include "grammar-parser.h" #include @@ -39,8 +38,8 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static bool is_interacting = false; +static bool is_interacting = false; static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, @@ -104,7 +103,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { @@ -184,8 +183,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, 
BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -246,23 +245,23 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -273,10 +272,10 @@ int main(int argc, char ** argv) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -294,8 +293,8 @@ int main(int argc, char ** argv) { params.n_keep = (int)embd_inp.size(); } - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // enable interactive mode if interactive start is specified @@ -358,39 +357,10 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: 
n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); LOG_TEE("\n##### Infill mode #####\n\n"); if (params.infill) { printf("\n************\n"); @@ -433,11 +403,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(model); - - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while (n_remain != 0 || params.interactive) { // predict @@ -484,7 +450,7 @@ int main(int argc, char ** argv) { LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); } @@ -512,7 +478,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -535,7 +501,7 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); @@ -554,12 +520,11 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx, id, true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -575,8 +540,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, 
we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -608,10 +576,10 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } fflush(stdout); printf("\n"); @@ -625,7 +593,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line, if so we use the old input - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_prefix = buffer; } buffer.clear(); @@ -635,7 +603,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_suffix = buffer; } buffer.clear(); @@ -648,7 +616,7 @@ int main(int argc, char ** argv) { process_escapes(params.input_suffix); } suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -658,14 +626,14 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); embd.clear(); embd_guidance.clear(); n_remain = params.n_predict; @@ -675,7 +643,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (last_tokens.back() == llama_token_eos(ctx)) { + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -692,7 +660,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -727,7 +695,7 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ 
-748,22 +716,14 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { break; } @@ -775,7 +735,7 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); fflush(stdout); } @@ -786,9 +746,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 7e395afd0..5bdbea4e2 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a04115c96..9bd82d565 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -19,7 +19,6 @@ #include "ggml.h" #include "llama.h" #include "common.h" -#include "build-info.h" #include "ggml-cuda.h" // utils @@ -641,8 +640,8 @@ struct test { } }; -const std::string test::build_commit = BUILD_COMMIT; -const int test::build_number = BUILD_NUMBER; +const std::string test::build_commit = LLAMA_COMMIT; +const int test::build_number = LLAMA_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::metal = !!ggml_cpu_has_metal(); @@ -933,7 +932,7 @@ struct sql_printer : public printer { }; static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - std::vector tokens(n_batch, llama_token_bos(ctx)); + std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; llama_set_n_threads(ctx, n_threads, n_threads); @@ -946,7 +945,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_token token = llama_token_bos(ctx); + llama_token token = llama_token_bos(llama_get_model(ctx)); llama_set_n_threads(ctx, n_threads, n_threads); @@ -1037,7 +1036,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); // warmup run if (t.n_prompt > 0) { @@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { diff --git 
a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
new file mode 100644
index 000000000..03d32c26e
--- /dev/null
+++ b/examples/llava/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TARGET clip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+set(TARGET llava)
+add_executable(${TARGET} llava.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/llava/README.md b/examples/llava/README.md
new file mode 100644
index 000000000..fc3446b60
--- /dev/null
+++ b/examples/llava/README.md
@@ -0,0 +1,57 @@
+# LLaVA
+
+Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
+
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
+
+After the API is confirmed, more models will be supported / uploaded.
+
+## Usage
+Build with cmake or run `make llava`.
+
+After building, run `./llava` to see the usage. For example:
+
+```sh
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+```
+
+**Note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+
+## Model conversion
+
+1. Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
+```
+
+3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py ../llava-v1.5-7b
+```
+
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
+
+## TODO
+
+- [ ] Support server mode.
+- [ ] Support non-CPU backend for the image encoding part.
+- [ ] Support different sampling methods.
+- [ ] Support more model variants.
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
new file mode 100644
index 000000000..61932e659
--- /dev/null
+++ b/examples/llava/clip.cpp
@@ -0,0 +1,1064 @@
+// NOTE: This is modified from clip.cpp only for LLaVA,
+// so there might still be unnecessary artifacts hanging around
+// I'll gradually clean and extend it
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "clip.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define CLIP_DEBUG
+
+static std::string format(const char * fmt, ...)
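+// printf-style formatting into a std::string: the first vsnprintf call (with a NULL buffer)
+// measures the required length, then the copied va_list is used to format into a buffer of
+// exactly that size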
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +// +// key constants +// + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" + +// +// tensor name constants +// + +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" + +// +// utilities to get data from a gguf file +// + +static int get_key_idx(const gguf_context * ctx, const char * key) { + int i = gguf_find_key(ctx, key); + if (i == -1) { + fprintf(stderr, "key %s not found in file\n", key); + throw std::runtime_error(format("Missing required key: %s", key)); + } + + return i; +} + +static uint32_t get_u32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_u32(ctx, i); +} + +static float get_f32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_f32(ctx, i); +} + +static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if (!cur) { + throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + + return cur; +} + +static std::string get_ftype(int ftype) { + switch (ftype) { + case 0: + return "f32"; + case 1: + return "f16"; + case 2: + return "q4_0"; + case 3: + return "q4_1"; + case 6: + return "q5_0"; + case 7: + return "q5_1"; + case 8: + return "q8_0"; + default: + throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype)); + } +} + +// +// clip layers +// + +struct clip_layer { + // attention + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + struct ggml_tensor * v_w; + 
struct ggml_tensor * v_b; + + struct ggml_tensor * o_w; + struct ggml_tensor * o_b; + + // layernorm 1 + struct ggml_tensor * ln_1_w; + struct ggml_tensor * ln_1_b; + + // ff + struct ggml_tensor * ff_i_w; + struct ggml_tensor * ff_i_b; + + struct ggml_tensor * ff_o_w; + struct ggml_tensor * ff_o_b; + + // layernorm 2 + struct ggml_tensor * ln_2_w; + struct ggml_tensor * ln_2_b; +}; + +struct clip_vision_model { + struct clip_vision_hparams hparams; + + // embeddings + struct ggml_tensor * class_embedding; + struct ggml_tensor * patch_embeddings; + struct ggml_tensor * position_embeddings; + + struct ggml_tensor * pre_ln_w; + struct ggml_tensor * pre_ln_b; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; + + // LLaVA projection + struct ggml_tensor * mm_0_w; + struct ggml_tensor * mm_0_b; + struct ggml_tensor * mm_2_w; + struct ggml_tensor * mm_2_b; +}; + +// Replacement for std::vector that doesn't require zero-initialization. +struct clip_buffer { + uint8_t * data = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] data; + data = new uint8_t[size]; + this->size = size; + } + + ~clip_buffer() { delete[] data; } +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_llava_projector = false; + struct clip_vision_model vision_model; + float image_mean[3]; + float image_std[3]; + bool use_gelu = false; + int32_t ftype = 1; + struct ggml_context * ctx; + struct gguf_context * ctx_gguf; + + // memory buffers to evaluate the model + clip_buffer buf_compute; + clip_buffer buf_alloc; + ggml_allocr * alloc = NULL; +}; + +static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return nullptr; + } + + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + //const int n_intermediate = hparams.n_intermediate; + //const int projection_dim = hparams.projection_dim; + const float eps = hparams.eps; + int batch_size = imgs->size; + if(ctx->has_llava_projector) { + GGML_ASSERT(batch_size == 1); + } + + const auto & buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + ggml_allocr_alloc(ctx->alloc, inp_raw); + + if (!ggml_allocr_is_measure(ctx->alloc)) { + float * data = (float *)ggml_get_data(inp_raw); + + for (size_t i = 0; i < imgs->size; i++) { + const int nx = imgs->data[i].nx; + const int ny = imgs->data[i].ny; + GGML_ASSERT(nx == image_size && ny == image_size); + + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + 
k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k]; + } + } + } + } + } + } + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + // concat class_embeddings and patch_embeddings + struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_allocr_alloc(ctx->alloc, embeddings); + if (!ggml_allocr_is_measure(ctx->alloc)) { + ggml_set_zero(embeddings); + } + + struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size); + ggml_allocr_alloc(ctx->alloc, temp); + + embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1], + embeddings->nb[2], embeddings->nb[3], 0); + embeddings = + ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + ggml_allocr_alloc(ctx->alloc, positions); + if (!ggml_allocr_is_measure(ctx->alloc)) { + for (int i = 0; i < num_positions; i++) { + ggml_set_i32_1d(positions, i, i); + } + } + + embeddings = + ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings)); + + // pre-layernorm + { + embeddings = ggml_norm(ctx0, embeddings, eps); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings), + ggml_repeat(ctx0, model.pre_ln_b, embeddings)); + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(ctx->alloc, KQ_scale); + if (!ggml_allocr_is_measure(ctx->alloc)) { + ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head)); + } + + // loop over layers + for (int il = 0; il < n_layer - 1; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + //const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); + + Q = ggml_scale_inplace(ctx0, Q, KQ_scale); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur)); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur)); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = 
ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); + + cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size)); + } + + // attention output + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur); + + if (ctx->use_gelu) { + cur = ggml_gelu_inplace(ctx0, cur); + } else { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // llava projector + { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_allocr_alloc(ctx->alloc, patches); + if (!ggml_allocr_is_measure(ctx->alloc)) { + for (int i = 0; i < num_patches; ++i) { + ggml_set_i32_1d(patches, i, i+1); + } + } + + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // mm projection 0 + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings); + + embeddings = ggml_gelu(ctx0, embeddings); + + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + ggml_free(ctx0); + + return gf; +} + +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + + struct ggml_context * meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname, params); + if (!ctx) { + throw std::runtime_error(format("%s: failed to load CLIP model from %s. 
Does this file exist?\n", __func__, fname)); + } + + if (verbosity >= 1) { + const int n_tensors = gguf_get_n_tensors(ctx); + const int n_kv = gguf_get_n_kv(ctx); + const int ftype = get_u32(ctx, KEY_FTYPE); + const std::string ftype_str = get_ftype(ftype); + const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); + const std::string description = gguf_get_val_str(ctx, idx_desc); + const int idx_name = gguf_find_key(ctx, KEY_NAME); + if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug + const std::string name = gguf_get_val_str(ctx, idx_name); + printf("%s: model name: %s\n", __func__, name.c_str()); + } + printf("%s: description: %s\n", __func__, description.c_str()); + printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + printf("%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_kv: %d\n", __func__, n_kv); + printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); + printf("\n"); + } + + // kv + if (verbosity >= 3) { + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ctx, i); + + printf("%s: kv[%d]: key = %s\n", __func__, i, key); + } + printf("\n"); + } + + // data + size_t ctx_size = 0; + { + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + + struct ggml_tensor * cur = ggml_get_tensor(meta, name); + ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + size_t tensor_size = ggml_nbytes(cur); + size_t padded_size = ggml_nbytes_pad(cur); + ctx_size += padded_size; + if (verbosity >= 3) { + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, + cur->n_dims, cur->name, tensor_size, padded_size, offset); + } + } + } + + clip_ctx * new_clip = new clip_ctx; + + // model size and capabilities + { + int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); + new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); + + idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); + new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); + + idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); + if (idx != -1) { + new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); + } + + GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + GGML_ASSERT(new_clip->has_vision_encoder); + GGML_ASSERT(!new_clip->has_text_encoder); + + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + + if (verbosity >= 1) { + printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0)); + printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + } + } + + // load tensors + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + new_clip->ctx = ggml_init(params); + if (!new_clip->ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + clip_free(new_clip); + return nullptr; + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + printf("cannot open model file 
for loading tensors\n"); + clip_free(new_clip); + return nullptr; + } + + const int n_tensors = gguf_get_n_tensors(ctx); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * t = ggml_get_tensor(meta, name); + struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t); + ggml_set_name(cur, name); + + const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); + fin.seekg(offset, std::ios::beg); + if (!fin) { + printf("%s: failed to seek for tensor %s\n", __func__, name); + clip_free(new_clip); + return nullptr; + } + + fin.read(reinterpret_cast(cur->data), ggml_nbytes(t)); + } + + fin.close(); + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); + new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std)); + } + + if (verbosity >= 2) { + printf("\n%s: vision model hparams\n", __func__); + printf("image_size %d\n", hparams.image_size); + printf("patch_size %d\n", hparams.patch_size); + printf("v_hidden_size %d\n", hparams.hidden_size); + printf("v_n_intermediate %d\n", hparams.n_intermediate); + printf("v_projection_dim %d\n", hparams.projection_dim); + printf("v_n_head %d\n", hparams.n_head); + printf("v_n_layer %d\n", hparams.n_layer); + } + + vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias")); + vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias")); + vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias")); + + vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", 
il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + +// measure mem requirement and allocate + { + static const size_t tensor_alignment = 32; + new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + new_clip->alloc = ggml_allocr_new_measure(tensor_alignment); + clip_image_f32_batch batch; + batch.size = 1; + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); + size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment; + ggml_allocr_free(new_clip->alloc); + new_clip->buf_alloc.resize(alloc_size); + new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment); + + printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0); + } + + return new_clip; +} + +clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); } + +clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); } + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { + int nx, ny, nc; + auto data = stbi_load(fname, &nx, &ny, &nc, 3); + if (!data) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname); + return false; + } + + img->nx = nx; + img->ny = ny; + img->size = nx * ny * 3; + img->data = new uint8_t[img->size](); + memcpy(img->data, data, img->size); + + stbi_image_free(data); + + return true; +} + +// normalize: x = (x - mean) / std +// TODO: implement bicubic interpolation instead of linear. 
+bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8 temp; // we will keep the input image data here temporarily + if (pad2square && img->nx != img->ny) { + int longer_side = std::max(img->nx, img->ny); + temp.nx = longer_side; + temp.ny = longer_side; + temp.size = 3 * longer_side * longer_side; + temp.data = new uint8_t[temp.size](); + uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA + + // fill with background color + for (size_t i = 0; i < temp.size; i++) { + temp.data[i] = bc[i % 3]; + } + + // copy from the input image + for (int y = 0; y < img->ny; y++) { + for (int x = 0; x < img->nx; x++) { + const int i = 3 * (y * img->nx + x); + const int j = 3 * (y * temp.nx + x); + temp.data[j] = img->data[i]; + temp.data[j+1] = img->data[i+1]; + temp.data[j+2] = img->data[i+2]; + } + } + } else { + temp.nx = img->nx; + temp.ny = img->ny; + temp.size = img->size; + temp.data = img->data; + } + + const int nx = temp.nx; + const int ny = temp.ny; + + const int nx2 = ctx->vision_model.hparams.image_size; + const int ny2 = ctx->vision_model.hparams.image_size; + + res->nx = nx2; + res->ny = ny2; + res->size = 3 * nx2 * ny2; + res->data = new float[res->size](); + + const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; + + const int nx3 = int(nx / scale + 0.5f); + const int ny3 = int(ny / scale + 0.5f); + + const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + // linear interpolation + const float sx = (x + 0.5f) * scale - 0.5f; + const float sy = (y + 0.5f) * scale - 0.5f; + + const int x0 = std::max(0, (int)std::floor(sx)); + const int y0 = std::max(0, (int)std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = 3 * (y0 * nx + x0) + c; + const int j01 = 3 * (y0 * nx + x1) + c; + const int j10 = 3 * (y1 * nx + x0) + c; + const int j11 = 3 * (y1 * nx + x1) + c; + + const float v00 = temp.data[j00]; + const float v01 = temp.data[j01]; + const float v10 = temp.data[j10]; + const float v11 = temp.data[j11]; + + const float v0 = v00 * (1.0f - dx) + v01 * dx; + const float v1 = v10 * (1.0f - dx) + v11 * dx; + + const float v = v0 * (1.0f - dy) + v1 * dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + const int i = 3 * (y * nx3 + x) + c; + + res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; + } + } + } + + return true; +} + +void clip_free(clip_ctx * ctx) { + ggml_free(ctx->ctx); + gguf_free(ctx->ctx_gguf); + delete ctx; +} + +bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, 
&imgs, vec); +} + +bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { + + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + + int batch_size = imgs->size; + if(ctx->has_llava_projector) { + GGML_ASSERT(batch_size == 1); // TODO: support multiple images + } + + // reset alloc buffer to clean the memory from previous invocations + ggml_allocr_reset(ctx->alloc); + + // build the inference graph + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_allocr_alloc_graph(ctx->alloc, gf); + + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = (uint8_t *)malloc(plan.work_size); + } + + ggml_graph_compute(gf, &plan); + + // the last node is the embedding tensor +struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + + // copy the embeddings to the location passed by the user + memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings)); + + if (plan.work_size > 0) { + free(plan.work_data); + } + + return true; +} + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + + ggml_type type = GGML_TYPE_Q4_1; + + switch (itype) { + case 2: + type = GGML_TYPE_Q4_0; + break; + case 3: + type = GGML_TYPE_Q4_1; + break; + case 6: + type = GGML_TYPE_Q5_0; + break; + case 7: + type = GGML_TYPE_Q5_1; + break; + case 8: + type = GGML_TYPE_Q8_0; + break; + default: + fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); + return false; + }; + + auto ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx; + + auto ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector read_data(512); + std::vector work(512); + std::vector conv_buf(512); + std::vector hist_all(1 << 4, 0); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (cur->n_dims == 2); + + if (quantize) { + new_type = type; + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (size_t j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + 
default: + printf("Please use an input file in f32 or f16\n"); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: { + new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: { + new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: { + new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: { + new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: { + new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + default: { + fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type); + return false; + } + } + + for (size_t j = 0; j < hist_cur.size(); ++j) { + hist_all[j] += hist_cur[j]; + } + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + + printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} + +int clip_n_mmproj_embd(struct clip_ctx * ctx) { + return ctx->vision_model.mm_2_b->ne[0]; +} + +int clip_n_patches(struct clip_ctx * ctx) { + auto & params = ctx->vision_model.hparams; + + return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); +} + +size_t clip_embd_nbytes(struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h new file mode 100644 index 000000000..3d7261e29 --- /dev/null +++ b/examples/llava/clip.h @@ -0,0 +1,73 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "ggml.h" + +struct clip_ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_vision_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; +}; + +struct clip_ctx * clip_model_load(const char * fname, const int verbosity); + +void clip_free(struct clip_ctx * ctx); + +size_t 
clip_embd_nbytes(struct clip_ctx * ctx); +int clip_n_patches(struct clip_ctx * ctx); +int clip_n_mmproj_embd(struct clip_ctx * ctx); + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + uint8_t * data; + size_t size; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + float * data; + size_t size; +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +struct clip_image_u8 * make_clip_image_u8(); +struct clip_image_f32 * make_clip_image_f32(); +bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); +bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); + +bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, + float * vec); + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); + +#ifdef __cplusplus +} +#endif + +#endif // CLIP_H diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py new file mode 100644 index 000000000..2f5eef199 --- /dev/null +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -0,0 +1,250 @@ +import argparse +import os +import json + +import torch +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor + +TEXT = "clip.text" +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + + if "mm_projector" in name: + return name.replace("model.mm_projector", "mm") + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") +ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) + +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + + +model = CLIPModel.from_pretrained(dir_model) +processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP 
model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None else args.image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + model.vision_model.encoder.layers.pop(-1) + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + if data.ndim == 2: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py new file mode 100644 index 000000000..515f6b58d --- /dev/null +++ b/examples/llava/llava-surgery.py @@ -0,0 +1,46 @@ +import argparse +import glob +import os +import torch + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1] +checkpoint = torch.load(path) + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/llava.projector") + +# remove these tensors from the checkpoint and save it again +for name in mm_tensors: + del checkpoint[name] + +# BakLLaVA models contain CLIP tensors in it +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] +if len(clip_tensors) > 0: + clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + # remove these tensors + for name in clip_tensors: + del checkpoint[name] + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + + +torch.save(checkpoint, path) + +print("Done!") +print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h new file mode 100644 index 000000000..320c71967 --- /dev/null +++ b/examples/llava/llava-utils.h @@ -0,0 +1,147 @@ +#pragma once + +// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now + +#include "common.h" +#include "llama.h" + +#include +#include +#include + +inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + + for (int i = 0; i < N; i += n_batch) { + int n_eval = N - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + if (llama_decode(ctx_llama, batch)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + } + return true; +} + +inline bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + } + return true; +} + +inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 
= str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past); + return true; +} + +// TODO: use common/sampling.h +inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { + auto & sparams = params.sparams; + + // out of user input, sample next token + const float temp = sparams.temp; + const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k; + const float top_p = sparams.top_p; + const float tfs_z = sparams.tfs_z; + const float typical_p = sparams.typical_p; + // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n; + // const float repeat_penalty = sparams.repeat_penalty; + // const float alpha_presence = sparams.presence_penalty; + // const float alpha_frequency = sparams.frequency_penalty; + const int mirostat = sparams.mirostat; + const float mirostat_tau = sparams.mirostat_tau; + const float mirostat_eta = sparams.mirostat_eta; + // const bool penalize_nl = sparams.penalize_nl; + + llama_token id = 0; + { + auto logits = llama_get_logits(ctx_llama); + auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama)); + + // Apply params.logit_bias map + for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl(ctx)]; + // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl(ctx)] = nl_logit; + // } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx_llama, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temp(ctx_llama, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temp(ctx_llama, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1); + llama_sample_temp(ctx_llama, &candidates_p, temp); + id = llama_sample_token(ctx_llama, &candidates_p); + } + } + } + + return id; +} + +inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) { + int id = sample_id(ctx_llama, params); + static std::string ret; + if (id == llama_token_eos(llama_get_model(ctx_llama))) { + ret = ""; + } else { + 
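+        // otherwise, convert the sampled token id to its text piece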
ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp new file mode 100644 index 000000000..f0974d5bc --- /dev/null +++ b/examples/llava/llava.cpp @@ -0,0 +1,164 @@ +#include "clip.h" +#include "llava-utils.h" +#include "common.h" +#include "llama.h" + +#include +#include +#include + +static void show_additional_info(int /*argc*/, char ** argv) { + printf("\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + show_additional_info(argc, argv); + return 1; + } + + if (params.mmproj.empty() || params.image.empty()) { + gpt_print_usage(argc, argv, params); + show_additional_info(argc, argv); + return 1; + } + + const char * clip_path = params.mmproj.c_str(); + const char * img_path = params.image.c_str(); + + if (params.prompt.empty()) { + params.prompt = "describe the image in detail."; + } + + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + + // load and preprocess the image + clip_image_u8 img; + clip_image_f32 img_res; + + if (!clip_image_load_from_file(img_path, &img)) { + fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path); + + clip_free(ctx_clip); + return 1; + } + + if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) { + fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path); + + clip_free(ctx_clip); + return 1; + } + + int n_img_pos = clip_n_patches(ctx_clip); + int n_img_embd = clip_n_mmproj_embd(ctx_clip); + + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); + + if (!image_embd) { + fprintf(stderr, "Unable to allocate memory for image embeddings\n"); + + return 1; + } + + const int64_t t_img_enc_start_us = ggml_time_us(); + if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) { + fprintf(stderr, "Unable to encode image\n"); + + return 1; + } + const int64_t t_img_enc_end_us = ggml_time_us(); + + // we get the embeddings, free up the memory required for CLIP + clip_free(ctx_clip); + + llama_backend_init(params.numa); + + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = params.n_gpu_layers; + model_params.main_gpu = params.main_gpu; + model_params.tensor_split = params.tensor_split; + model_params.use_mmap = params.use_mmap; + model_params.use_mlock = params.use_mlock; + + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings + ctx_params.n_threads = params.n_threads; + ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + ctx_params.seed = params.seed; + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + // make sure that the correct mmproj was used, i.e., compare apples to apples + const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + + if (n_img_embd != n_llama_embd) { + printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd); + + llama_free(ctx_llama); + llama_free_model(model); + llama_backend_free(); + free(image_embd); + + return 1; + } + + // process the prompt + // llava chat format is "USER: \n\nASSISTANT:" + + int n_past = 0; + + const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; + + eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); + eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past); + eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false); + + // generate the response + + printf("\n"); + printf("prompt: '%s'\n", params.prompt.c_str()); + printf("\n"); + + for (int i = 0; i < max_tgt_len; i++) { + const char * tmp = sample(ctx_llama, params, &n_past); + if (strcmp(tmp, "") == 0) break; + + printf("%s", tmp); + fflush(stdout); + } + + printf("\n"); + + { + const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; + + printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos); + } + + llama_print_timings(ctx_llama); + + llama_free(ctx_llama); + llama_free_model(model); + llama_backend_free(); + free(image_embd); + + return 0; +} diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 908131884..cb00edbbb 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -16,6 +16,8 @@ add_library(common OBJECT ${_common_path}/console.cpp ${_common_path}/grammar-parser.h ${_common_path}/grammar-parser.cpp + ${_common_path}/sampling.h + ${_common_path}/sampling.cpp ) # WARNING: because build-info.h is auto-generated, it will only diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index cc1888948..d532980b7 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/main/README.md b/examples/main/README.md index a9561c383..a3428b487 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho Example usage: `--top-p 0.95` +### Min P Sampling + +- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05). + +The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. 
The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. + +Example usage: `--min-p 0.05` + ### Tail Free Sampling (TFS) - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 55f73356f..8d985c82a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -2,8 +2,6 @@ #include "console.h" #include "llama.h" -#include "build-info.h" -#include "grammar-parser.h" #include #include @@ -109,7 +107,7 @@ int main(int argc, char ** argv) { if (!gpt_params_parse(argc, argv, params)) { return 1; } - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -154,8 +152,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -238,19 +236,19 @@ int main(int argc, char ** argv) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); embd_inp = session_tokens; } LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -260,11 +258,11 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -299,7 +297,7 @@ int main(int argc, char ** argv) { } // remove any 
"future" tokens that we might have inherited from the previous session - llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOGLN( @@ -320,11 +318,11 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -383,6 +381,12 @@ int main(int argc, char ** argv) { if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } @@ -392,45 +396,27 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } if (!params.input_suffix.empty()) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); - } - } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); - - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } } } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), 
grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } - - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); if (params.interactive) { const char *control_message; @@ -471,11 +457,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(model); - - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -522,7 +504,7 @@ int main(int argc, char ** argv) { LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG("clear session path\n"); path_session.clear(); @@ -552,7 +534,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { int input_size = 0; llama_token * input_buf = NULL; @@ -574,7 +555,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -597,7 +578,7 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); @@ -627,12 +608,11 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx, id, true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -648,8 +628,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -679,12 +662,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt + // check for reverse prompt in the last n_prev tokens if 
(!params.antiprompt.empty()) { - std::string last_output; - for (auto id : last_tokens) { - last_output += llama_token_to_piece(ctx, id); - } + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -711,13 +692,13 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (last_tokens.back() == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } @@ -738,14 +719,13 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty()) { LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - buffer += params.input_prefix; - printf("%s", buffer.c_str()); + printf("%s", params.input_prefix.c_str()); } // color user input only @@ -767,7 +747,6 @@ int main(int argc, char ** argv) { // append input suffix if any if (!params.input_suffix.empty()) { LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } @@ -781,11 +760,18 @@ int main(int argc, char ** argv) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } + if (params.escape) { + process_escapes(buffer); + } - const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); // instruct mode: insert response suffix if (params.instruct) { @@ -810,22 +796,14 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { LOG_TEE(" [end of text]\n"); break; } @@ -850,9 +828,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + 
llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index 0bbf89eae..319535a6e 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 63ddcd8ed..a78df305f 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -1,8 +1,6 @@ // A basic application simulating a server with multiple clients. // The clients submite requests to the server and they are processed in parallel. -#include "build-info.h" - #include "common.h" #include "llama.h" @@ -51,6 +49,12 @@ static std::vector k_prompts = { }; struct client { + ~client() { + if (ctx_sampling) { + llama_sampling_free(ctx_sampling); + } + } + int32_t id = 0; llama_seq_id seq_id = -1; @@ -68,7 +72,7 @@ struct client { std::string prompt; std::string response; - std::vector tokens_prev; + struct llama_sampling_context * ctx_sampling = nullptr; }; static void print_date_time() { @@ -125,8 +129,6 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL); - // load the prompts from an external file if there are any if (params.prompt.empty()) { printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); @@ -147,20 +149,15 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); fflush(stderr); - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(model); + const int n_ctx = llama_n_ctx(ctx); std::vector clients(n_clients); for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.tokens_prev.resize(std::max(256, params.n_predict)); - std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + client.ctx_sampling = llama_sampling_init(params.sparams); } - std::vector candidates; - candidates.reserve(n_vocab); - std::vector tokens_system; tokens_system = ::llama_tokenize(ctx, k_system, true); const int32_t n_tokens_system = tokens_system.size(); @@ -169,7 +166,7 @@ int main(int argc, char ** argv) { // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple // users. 
regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time - llama_batch batch = llama_batch_init(n_ctx, 0); + llama_batch batch = llama_batch_init(n_ctx, 0, 1); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; @@ -184,13 +181,8 @@ int main(int argc, char ** argv) { { LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); - batch.n_tokens = n_tokens_system; - - for (int32_t i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = tokens_system[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (int32_t i = 0; i < n_tokens_system; ++i) { + llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -209,7 +201,7 @@ int main(int argc, char ** argv) { LOG_TEE("Processing requests ...\n\n"); while (true) { - batch.n_tokens = 0; + llama_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -217,15 +209,11 @@ int main(int argc, char ** argv) { continue; } - batch.token [batch.n_tokens] = client.sampled; - batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded; - batch.seq_id[batch.n_tokens] = client.id; - batch.logits[batch.n_tokens] = true; - - client.n_decoded += 1; client.i_batch = batch.n_tokens; - batch.n_tokens += 1; + llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true); + + client.n_decoded += 1; } if (batch.n_tokens == 0) { @@ -250,18 +238,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + llama_sampling_reset(client.ctx_sampling); // do not prepend BOS because we have a system prompt! 
std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - batch.token [batch.n_tokens] = tokens_prompt[i]; - batch.pos [batch.n_tokens] = i + n_tokens_system; - batch.seq_id[batch.n_tokens] = client.id; - batch.logits[batch.n_tokens] = false; - batch.n_tokens += 1; + llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false); } // extract the logits only for the last token @@ -304,11 +288,12 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, - batch.token + i, + batch.token + i, nullptr, - batch.pos + i, - batch.seq_id + i, - batch.logits + i, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; @@ -341,7 +326,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id); + const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); + + llama_sampling_accept(client.ctx_sampling, ctx, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -349,11 +336,8 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - // remember which tokens were sampled - used for repetition penalties during sampling - client.tokens_prev.erase(client.tokens_prev.begin()); - client.tokens_prev.push_back(id); - const std::string token_str = llama_token_to_piece(ctx, id); + client.response += token_str; client.sampled = id; @@ -361,7 +345,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (id == llama_token_eos(ctx) || + (id == llama_token_eos(model) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { @@ -386,7 +370,7 @@ int main(int argc, char ** argv) { n_total_prompt += client.n_prompt; n_total_gen += client.n_decoded; - llama_sampling_context_reset(ctx_sampling, client.seq_id); + client.seq_id = -1; } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index af00b4e16..3c76d3221 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7d0038bd4..de60c5227 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" @@ -210,7 +209,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; 
@@ -227,7 +226,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(ctx); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } const auto batch_logits = llama_get_logits(ctx); @@ -339,7 +338,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -350,7 +349,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(ctx); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { @@ -573,7 +572,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // clear the KV cache - llama_kv_cache_tokens_rm(ctx, -1, -1); + llama_kv_cache_clear(ctx); auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab); if (logits.empty()) { diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index db182e263..e31cf5e38 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index dd76b1cee..271282477 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -#include "build-info.h" #include "common.h" #include "ggml.h" #include "llama.h" diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 4a8eed544..6f374a2bd 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,9 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index c7dd0d894..d27ea5e91 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" @@ -18,7 +17,6 @@ static const std::vector QUANT_OPTIONS = { { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, -#ifdef 
GGML_USE_K_QUANTS { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, @@ -31,7 +29,6 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, -#endif { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, @@ -70,13 +67,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { @@ -103,6 +101,8 @@ int main(int argc, char ** argv) { params.quantize_output_tensor = false; } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; + } else if (strcmp(argv[arg_idx], "--pure") == 0) { + params.pure = true; } else { usage(argv[0]); } diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index eadd13cdf..cc6ed8554 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index f9e3c98a3..48d801110 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,4 +1,3 @@ -#include "build-info.h" #include "common.h" #include "llama.h" @@ -8,10 +7,7 @@ int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; - params.seed = 42; - params.n_threads = 4; - sparams.repeat_last_n = 64; + params.prompt = "The quick brown fox"; if (!gpt_params_parse(argc, argv, params)) { @@ -25,56 +21,49 @@ int main(int argc, char ** argv) { } auto n_past = 0; - auto last_n_tokens_data = 
std::vector(sparams.repeat_last_n, 0); + + std::string result0; + std::string result1; // init llama_model * model; llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - if (model == nullptr) { - return 1; - } - if (ctx == nullptr) { - llama_free_model(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } + + // tokenize prompt auto tokens = llama_tokenize(ctx, params.prompt, true); - auto n_prompt_tokens = tokens.size(); - if (n_prompt_tokens < 1) { - fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); - llama_free(ctx); - llama_free_model(model); - return 1; - } // evaluate prompt - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0)); + llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); + n_past += tokens.size(); - last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens); - n_past += n_prompt_tokens; - - const size_t state_size = llama_get_state_size(ctx); - uint8_t * state_mem = new uint8_t[state_size]; - - // Save state (rng, logits, embedding and kv_cache) to file + // save state (rng, logits, embedding and kv_cache) to file { - FILE *fp_write = fopen("dump_state.bin", "wb"); - llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file - fwrite(state_mem, 1, state_size, fp_write); - fclose(fp_write); + std::vector state_mem(llama_get_state_size(ctx)); + + { + FILE *fp_write = fopen("dump_state.bin", "wb"); + llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file + fwrite(state_mem.data(), 1, state_mem.size(), fp_write); + fclose(fp_write); + } } // save state (last tokens) - const auto last_n_tokens_data_saved = std::vector(last_n_tokens_data); const auto n_past_saved = n_past; // first run - printf("\n%s", params.prompt.c_str()); + printf("\nfirst run: %s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(model); + std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -83,9 +72,10 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); auto next_token_str = llama_token_to_piece(ctx, next_token); - last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); + result0 += next_token_str; + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx); @@ -103,32 +93,28 @@ int main(int argc, char ** argv) { // make new context auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); - // Load state (rng, logits, embedding and kv_cache) from file - { - FILE *fp_read = fopen("dump_state.bin", "rb"); - if (state_size != llama_get_state_size(ctx2)) { - fprintf(stderr, "\n%s : failed to validate state size\n", __func__); - llama_free(ctx2); - llama_free_model(model); - return 1; - } + printf("\nsecond run: %s", params.prompt.c_str()); - const size_t ret = fread(state_mem, 1, state_size, fp_read); - if (ret != state_size) { + // load state (rng, logits, embedding and kv_cache) from file + { + std::vector 
state_mem(llama_get_state_size(ctx2)); + + FILE * fp_read = fopen("dump_state.bin", "rb"); + + const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read); + if (ret != state_mem.size()) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); return 1; } - llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file + llama_set_state_data(ctx2, state_mem.data()); + fclose(fp_read); } - delete[] state_mem; - // restore state (last tokens) - last_n_tokens_data = last_n_tokens_data_saved; n_past = n_past_saved; // second run @@ -143,10 +129,11 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); auto next_token_str = llama_token_to_piece(ctx2, next_token); - last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { + result1 += next_token_str; + + if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx2); llama_free_model(model); @@ -155,10 +142,17 @@ int main(int argc, char ** argv) { n_past += 1; } - printf("\n\n"); + printf("\n"); llama_free(ctx2); llama_free_model(model); + if (result0 != result1) { + fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); + return 1; + } + + fprintf(stderr, "\n%s : success\n", __func__); + return 0; } diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 3782f9b80..1f0d26f77 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -6,11 +6,8 @@ install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/server/README.md b/examples/server/README.md index 0a9c2c06d..d6597d519 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -24,6 +24,10 @@ Command line options: - `--port`: Set the port to listen. Default: `8080`. - `--path`: path from which to serve static files (default examples/server/public) - `--embedding`: Enable embedding extraction, Default: disabled. +- `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1) +- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a. dynamic batching) (default: disabled) +- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) +- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. ## Build @@ -106,25 +110,25 @@ node index.js ## API Endpoints -- **POST** `/completion`: Given a prompt, it returns the predicted completion. +- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* + `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens.
Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does. + + `temperature`: Adjust the randomness of the generated text (default: 0.8). `top_k`: Limit the next token selection to the K most probable tokens (default: 40). `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). - `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). + `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). - `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. - By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does. - `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -158,6 +162,44 @@ node index.js `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) + `image_data`: An array of objects to hold base64-encoded image `data` and their `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. + + *Result JSON:* + + Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. + + `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+ + `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) + + `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` + + `model`: The path to the model loaded with `-m` + + `prompt`: The provided `prompt` + + `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token + + `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered + + `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided + + `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) + + `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` + + `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) + + `tokens_evaluated`: Number of tokens evaluated in total from the prompt + + `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) + + `slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1) + + `cache_prompt`: Save the prompt and generation to avoid reprocessing the entire prompt if a part of it has not changed (default: false) + + `system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) + - **POST** `/tokenize`: Tokenize a given text. *Options:* @@ -188,8 +230,32 @@ node index.js It also accepts all the options of `/completion` except `stream` and `prompt`. +- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. + ## More examples +### Change system prompt on runtime + +To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be done once to establish it. + +`prompt`: Specify a context that you want all connecting clients to respect. + +`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. + +`assistant_name`: The bot's name, which each client needs in order to generate the prompt. This must be sent to each client through the `/props` endpoint. + +```json +{ + "system_prompt": { + "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm.
The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", + "anti_prompt": "User:", + "assistant_name": "Assistant:" + } +} +``` + +**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. + ### Interactive mode Check the sample in [chat.mjs](chat.mjs). diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py index 14d2dcf65..313e1a965 100755 --- a/examples/server/api_like_OAI.py +++ b/examples/server/api_like_OAI.py @@ -8,6 +8,7 @@ import json app = Flask(__name__) +slot_id = -1 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n') @@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False): if(is_present(body, "stop")): postData["stop"] += body["stop"] postData["n_keep"] = -1 postData["stream"] = stream - + postData["cache_prompt"] = True + postData["slot_id"] = slot_id return postData def make_resData(data, chat=False, promptToken=[]): @@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False): } ] } + slot_id = data["slot_id"] if (chat): if (start): resData["choices"][0]["delta"] = { diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs index 87f4d2926..219ebb51a 100644 --- a/examples/server/chat.mjs +++ b/examples/server/chat.mjs @@ -7,6 +7,11 @@ const args = process.argv.slice(2); const grammarJsonSchemaFile = args.find( (_, index) => args[index - 1] === "--grammar-json-schema" ); + +const no_cached_prompt = args.find( + (_, index) => args[index - 1] === "--no-cache-prompt" +) ?? 
"false"; + const grammarFile = args.find((_, index) => args[index - 1] === "--grammar"); // Example usage: function,arguments @@ -30,6 +35,9 @@ if (grammarFile) { grammar = readFileSync(grammarFile, 'utf-8') } +// for cached prompt +let slot_id = -1; + const API_URL = 'http://127.0.0.1:8080' const chat = [ @@ -76,6 +84,8 @@ async function chat_completion(question) { top_p: 0.9, n_keep: n_keep, n_predict: 256, + cache_prompt: no_cached_prompt === "false", + slot_id: slot_id, stop: ["\n### Human:"], // stop completion after generating this grammar, stream: true, @@ -92,6 +102,7 @@ async function chat_completion(question) { const t = Buffer.from(chunk).toString('utf8') if (t.startsWith('data: ')) { const message = JSON.parse(t.substring(6)) + slot_id = message.slot_id answer += message.content process.stdout.write(message.content) if (message.stop) { diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp index 58e3387d1..5d3bdfbdd 100644 --- a/examples/server/index.html.hpp +++ b/examples/server/index.html.hpp @@ -186,351 +186,369 @@ unsigned char index_html[] = { 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x64, 0x65, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, - 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, 0x6d, 0x6f, - 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, - 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, - 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x33, 0x70, - 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x35, - 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, 0x6f, - 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x64, + 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, + 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, 0x6d, + 0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, + 0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, 0x6d, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x33, + 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2e, 0x73, 0x6c, 0x69, 0x6d, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, - 0x6e, 0x3a, 0x20, 0x30, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, - 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, - 0x61, 0x64, 0x65, 0x72, 0x2c, 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, - 0x74, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x3a, 0x20, 0x63, 0x65, 0x6e, - 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, - 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x38, 0x30, 0x25, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, - 0x23, 0x38, 0x38, 0x38, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, + 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, + 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2e, 0x73, 0x6c, 0x69, 0x6d, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, + 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, + 0x61, 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, + 0x65, 0x61, 0x64, 0x65, 0x72, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, + 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e, + 0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6f, + 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x38, + 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, 0x38, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d, + 0x6f, 0x64, 0x65, 0x2d, 0x63, 0x68, 0x61, 0x74, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x70, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x34, + 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x63, - 0x68, 0x61, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, - 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, - 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x34, 0x2e, 0x35, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, - 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, - 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 
0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x40, 0x6b, 0x65, 0x79, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x20, - 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, - 0x69, 0x70, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x65, + 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, + 0x31, 0x30, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x40, 0x6b, 0x65, 0x79, 0x66, 0x72, + 0x61, 0x6d, 0x65, 0x73, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, + 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69, 0x70, 0x65, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x25, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, + 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, + 0x6e, 0x3a, 0x20, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, 0x25, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, - 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x30, 0x25, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, 0x25, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, - 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, - 0x6f, 0x6e, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, + 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x31, 0x30, + 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, + 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x3a, 0x20, 0x23, + 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, + 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, + 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, + 0x6e, 0x64, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x35, 0x30, 0x25, + 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, + 0x69, 0x6d, 0x61, 0x67, 0x65, 0x3a, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, + 0x72, 0x2d, 0x67, 0x72, 0x61, 0x64, 0x69, 0x65, 0x6e, 0x74, 0x28, 0x39, + 0x30, 0x64, 0x65, 0x67, 0x2c, 0x20, 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x31, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, - 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, + 0x72, 0x2d, 0x31, 0x29, 
0x2c, 0x20, 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, - 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, - 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x73, 0x69, 0x7a, - 0x65, 0x3a, 0x20, 0x35, 0x30, 0x25, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, - 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x3a, - 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x2d, 0x67, 0x72, 0x61, 0x64, - 0x69, 0x65, 0x6e, 0x74, 0x28, 0x39, 0x30, 0x64, 0x65, 0x67, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x32, 0x29, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69, 0x70, 0x65, 0x20, 0x32, 0x73, - 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x20, 0x69, 0x6e, 0x66, 0x69, - 0x6e, 0x69, 0x74, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x40, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x20, - 0x28, 0x70, 0x72, 0x65, 0x66, 0x65, 0x72, 0x73, 0x2d, 0x63, 0x6f, 0x6c, - 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, 0x3a, 0x20, 0x64, - 0x61, 0x72, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, - 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, - 0x31, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x30, 0x30, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, + 0x72, 0x2d, 0x32, 0x29, 0x2c, 0x20, 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, - 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, - 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, - 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x3a, 0x20, 0x62, 0x6c, 0x61, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, - 0x20, 0x20, 0x3c, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, - 0x70, 0x65, 0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, - 0x7b, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x20, 0x68, 0x2c, - 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, - 0x72, 0x2c, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75, - 0x73, 0x65, 0x45, 0x66, 0x66, 
0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73, - 0x65, 0x52, 0x65, 0x66, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, - 0x6e, 0x61, 0x6c, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, - 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, - 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x7d, 0x20, 0x66, - 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x2d, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2d, 0x74, 0x6f, 0x2d, 0x67, 0x72, 0x61, - 0x6d, 0x6d, 0x61, 0x72, 0x2e, 0x6d, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, - 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, 0x73, - 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, - 0x73, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, - 0x65, 0x6e, 0x20, 0x55, 0x73, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, - 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, 0x69, - 0x65, 0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, 0x74, 0x62, 0x6f, - 0x74, 0x2e, 0x20, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x69, 0x73, 0x20, - 0x68, 0x65, 0x6c, 0x70, 0x66, 0x75, 0x6c, 0x2c, 0x20, 0x6b, 0x69, 0x6e, - 0x64, 0x2c, 0x20, 0x68, 0x6f, 0x6e, 0x65, 0x73, 0x74, 0x2c, 0x20, 0x67, - 0x6f, 0x6f, 0x64, 0x20, 0x61, 0x74, 0x20, 0x77, 0x72, 0x69, 0x74, 0x69, - 0x6e, 0x67, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6e, 0x65, 0x76, 0x65, - 0x72, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x61, - 0x6e, 0x73, 0x77, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x72, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x73, 0x20, 0x69, 0x6d, 0x6d, 0x65, 0x64, - 0x69, 0x61, 0x74, 0x65, 0x6c, 0x79, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x77, - 0x69, 0x74, 0x68, 0x20, 0x70, 0x72, 0x65, 0x63, 0x69, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, - 0x7b, 0x7b, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, - 0x6e, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, - 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, - 0x22, 0x7b, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, - 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, - 0x63, 0x72, 0x69, 0x70, 0x74, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, - 0x63, 0x68, 0x61, 0x74, 0x22, 0x2c, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x22, - 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x7c, 0x20, 0x22, 0x63, 0x6f, 0x6d, - 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x72, 0x3a, 0x20, 0x22, 0x4c, 0x6c, - 0x61, 0x6d, 0x61, 0x22, 0x2c, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20, 0x22, 0x55, 0x73, 0x65, 0x72, 0x22, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, - 0x65, 0x64, 0x69, 0x63, 0x74, 0x3a, 0x20, 0x34, 0x30, 0x30, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, - 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, 0x20, 0x30, 0x2e, 0x37, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x3a, 0x20, 0x32, 0x35, 0x36, - 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, - 0x61, 0x62, 0x6c, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x2c, 0x20, 0x2d, 0x31, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x78, 0x74, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e, 0x31, 0x38, 0x2c, 0x20, - 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, + 0x72, 0x2d, 0x31, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, + 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, + 0x69, 0x70, 0x65, 0x20, 0x32, 0x73, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, + 0x72, 0x20, 0x69, 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x65, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x40, + 0x6d, 0x65, 0x64, 0x69, 0x61, 0x20, 0x28, 0x70, 0x72, 0x65, 0x66, 0x65, + 0x72, 0x73, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x65, 0x3a, 0x20, 0x64, 0x61, 0x72, 0x6b, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, + 0x69, 0x6e, 0x67, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, + 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x3a, 0x20, 0x23, 0x32, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, + 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, + 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, + 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x62, 0x6c, + 0x61, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, + 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x6d, + 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x68, 0x2c, 0x20, + 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x65, 0x66, 0x66, 0x65, + 0x63, 0x74, 0x2c, 0x20, 0x63, 
0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, + 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x20, 0x75, 0x73, + 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75, 0x73, 0x65, + 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x52, + 0x65, 0x66, 0x2c, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, + 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, + 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e, 0x6a, 0x73, 0x27, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, + 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, + 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, + 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, + 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, + 0x74, 0x65, 0x72, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, + 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, + 0x2d, 0x74, 0x6f, 0x2d, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x2e, + 0x6d, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, + 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x69, + 0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x61, 0x72, 0x20, 0x73, 0x6c, + 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x20, 0x3d, 0x20, 0x2d, 0x31, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, + 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, + 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, + 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, + 0x65, 0x65, 0x6e, 0x20, 0x55, 0x73, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, + 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, 0x74, 0x62, + 0x6f, 0x74, 0x2e, 0x20, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x69, 0x73, + 0x20, 0x68, 0x65, 0x6c, 0x70, 0x66, 0x75, 0x6c, 0x2c, 0x20, 0x6b, 0x69, + 0x6e, 0x64, 0x2c, 0x20, 0x68, 0x6f, 0x6e, 0x65, 0x73, 0x74, 0x2c, 0x20, + 0x67, 0x6f, 0x6f, 0x64, 0x20, 0x61, 0x74, 0x20, 0x77, 0x72, 0x69, 0x74, + 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6e, 0x65, 0x76, + 0x65, 0x72, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x74, 0x6f, 0x20, + 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x72, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x73, 0x20, 0x69, 0x6d, 0x6d, 0x65, + 0x64, 0x69, 0x61, 0x74, 0x65, 0x6c, 0x79, 0x20, 0x61, 0x6e, 0x64, 0x20, + 0x77, 0x69, 0x74, 0x68, 0x20, 0x70, 0x72, 0x65, 0x63, 0x69, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, + 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, + 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, + 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, + 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, + 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, + 0x7b, 0x7b, 0x6d, 0x65, 0x73, 
0x73, 0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, + 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x2c, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x7c, 0x20, 0x22, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x72, 0x3a, 0x20, 0x22, 0x4c, + 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20, 0x22, 0x55, 0x73, 0x65, 0x72, + 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x61, + 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, + 0x20, 0x27, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, + 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, + 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x3a, 0x20, 0x34, 0x30, 0x30, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, + 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, 0x20, 0x30, 0x2e, 0x37, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, + 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x3a, 0x20, 0x32, + 0x35, 0x36, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, 0x20, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x2c, 0x20, 0x2d, 0x31, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, + 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e, 0x31, 0x38, + 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x3a, 0x20, 0x34, 0x30, 0x2c, + 0x20, 0x2f, 0x2f, 0x20, 0x3c, 0x3d, 0x20, 0x30, 0x20, 0x74, 0x6f, 0x20, + 0x75, 0x73, 0x65, 0x20, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x20, 0x73, 0x69, + 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, + 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, + 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, + 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x66, 0x73, + 0x5f, 0x7a, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, + 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, + 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, + 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, + 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, + 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x3a, 0x20, 0x34, 0x30, 0x2c, 0x20, 0x2f, - 0x2f, 0x20, 0x3c, 0x3d, 0x20, 0x30, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x73, - 0x65, 0x20, 0x76, 0x6f, 0x63, 
0x61, 0x62, 0x20, 0x73, 0x69, 0x7a, 0x65, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x70, - 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, - 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x66, 0x73, 0x5f, 0x7a, - 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, - 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x69, 0x63, - 0x61, 0x6c, 0x5f, 0x70, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, - 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, - 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x72, - 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, - 0x2f, 0x20, 0x30, 0x2f, 0x31, 0x2f, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, - 0x61, 0x75, 0x3a, 0x20, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x65, 0x6e, 0x74, 0x72, 0x6f, 0x70, 0x79, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x3a, 0x20, 0x30, 0x2e, 0x31, - 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x61, 0x72, 0x6e, 0x69, 0x6e, - 0x67, 0x20, 0x72, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, 0x27, 0x27, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, - 0x6f, 0x62, 0x73, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6e, - 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, - 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x53, 0x54, 0x41, 0x52, 0x54, 0x3a, - 0x20, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, - 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x65, - 0x74, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x62, 0x6f, 0x72, 0x77, - 0x73, 0x65, 0x72, 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, - 0x72, 0x61, 0x67, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, - 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x22, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x63, 0x70, 0x70, 0x5f, 0x73, 0x65, 0x72, 0x76, - 0x65, 0x72, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, - 0x72, 0x61, 0x67, 0x65, 0x22, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x75, 0x6e, 0x63, 0x74, 
0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, + 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, + 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x30, 0x2c, + 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2f, 0x31, 0x2f, 0x32, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, + 0x5f, 0x74, 0x61, 0x75, 0x3a, 0x20, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x65, 0x6e, 0x74, 0x72, 0x6f, + 0x70, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, + 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x3a, 0x20, 0x30, + 0x2e, 0x31, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x61, 0x72, 0x6e, + 0x69, 0x6e, 0x67, 0x20, 0x72, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, + 0x27, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, + 0x70, 0x72, 0x6f, 0x62, 0x73, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, + 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, + 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, + 0x74, 0x69, 0x65, 0x73, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, + 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, + 0x63, 0x68, 0x65, 0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, + 0x74, 0x72, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x53, 0x54, 0x41, 0x52, + 0x54, 0x3a, 0x20, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x70, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x62, 0x6f, + 0x72, 0x77, 0x73, 0x65, 0x72, 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x53, + 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, - 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, + 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x3d, 0x20, + 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x63, 0x70, 0x70, 0x5f, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, + 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x22, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, + 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, + 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61, 0x67, 0x2c, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 
0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, + 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x49, 0x74, 0x65, + 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, + 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, + 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, + 0x61, 0x67, 0x2c, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, + 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, + 0x72, 0x6f, 0x6d, 0x52, 0x61, 0x77, 0x54, 0x65, 0x78, 0x74, 0x28, 0x74, + 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, + 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x73, 0x65, + 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, + 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, + 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, + 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, + 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, + 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61, 0x67, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x20, 0x3d, 0x20, 0x6c, 0x6f, + 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x67, + 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, + 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, + 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, + 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x69, 0x74, 0x65, + 0x6d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, + 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x4a, 0x53, 0x4f, 0x4e, + 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x69, 0x74, 0x65, 0x6d, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, + 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, + 0x61, 0x74, 0x61, 0x41, 0x73, 0x52, 0x61, 0x77, 0x54, 0x65, 0x78, 0x74, + 0x28, 0x74, 0x61, 0x67, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, + 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, + 0x61, 0x67, 0x65, 0x2e, 0x67, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 
0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, - 0x2c, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, - 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, - 0x6d, 0x52, 0x61, 0x77, 0x54, 0x65, 0x78, 0x74, 0x28, 0x74, 0x61, 0x67, - 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x49, - 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, - 0x20, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, - 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61, 0x67, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x74, 0x65, 0x6d, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, - 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x67, 0x65, 0x74, - 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, - 0x2b, 0x20, 0x74, 0x61, 0x67, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x69, 0x74, 0x65, 0x6d, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, - 0x61, 0x72, 0x73, 0x65, 0x28, 0x69, 0x74, 0x65, 0x6d, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, - 0x61, 0x41, 0x73, 0x52, 0x61, 0x77, 0x54, 0x65, 0x78, 0x74, 0x28, 0x74, - 0x61, 0x67, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x20, 0x3d, - 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x2e, 0x67, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, - 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, - 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, - 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, - 0x69, 0x74, 0x65, 0x6d, 0x29, 0x20, 
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, - 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, - 0x74, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x20, 0x61, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, - 0x74, 0x69, 0x6e, 0x67, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, - 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x7d, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, - 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x73, 0x69, - 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x27, 0x27, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x3a, 0x20, 0x7b, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x3a, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x3a, 0x20, 0x7b, 0x7d, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, - 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x6c, 0x79, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x21, 0x69, 0x74, 0x65, 0x6d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x69, 0x74, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x20, + 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x69, 0x66, 0x20, 0x74, - 0x68, 0x65, 0x72, 0x65, 0x20, 0x61, 0x72, 0x65, 0x20, 0x61, 0x6e, 0x79, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x75, 0x73, 0x65, 0x72, - 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, - 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, - 0x61, 0x72, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x20, 0x69, - 0x6e, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x6e, 0x20, 0x66, - 0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x7b, 0x20, 0x22, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, - 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 
0x6c, 0x61, 0x74, 0x65, 0x64, 0x61, - 0x74, 0x61, 0x22, 0x20, 0x7d, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x7b, 0x20, - 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, - 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x64, 0x61, 0x74, 0x61, - 0x22, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x49, 0x6d, - 0x70, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, - 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x69, - 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, - 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x61, + 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, + 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, + 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, + 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, + 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3a, 0x20, 0x27, 0x27, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x7b, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x3a, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x3a, 0x20, 0x7b, 0x7d, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65, 0x74, 0x27, + 0x73, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6c, 0x6f, 0x63, + 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x69, 0x66, + 0x20, 0x74, 0x68, 0x65, 0x72, 0x65, 0x20, 0x61, 0x72, 0x65, 0x20, 0x61, + 0x6e, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x75, 0x73, + 0x65, 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, + 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, + 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, + 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x6e, + 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x7b, 0x20, 0x22, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x6e, 0x61, 0x6d, 0x65, + 0x22, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x64, 0x61, 0x74, 0x61, 0x22, 0x20, 0x7d, 0x20, 0x61, 0x6e, 0x64, 0x20, + 0x7b, 0x20, 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 
0x65, 0x6e, 0x61, 0x6d, 0x65, 0x22, + 0x3a, 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x64, 0x61, + 0x74, 0x61, 0x22, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, + 0x49, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x20, 0x77, 0x65, 0x72, 0x65, 0x20, 0x73, 0x75, 0x63, 0x63, 0x65, - 0x73, 0x73, 0x66, 0x75, 0x6c, 0x79, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, - 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, - 0x27, 0x50, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x69, 0x6e, 0x67, 0x20, + 0x73, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, + 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, + 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, + 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, + 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, + 0x74, 0x65, 0x73, 0x20, 0x77, 0x65, 0x72, 0x65, 0x20, 0x73, 0x75, 0x63, + 0x63, 0x65, 0x73, 0x73, 0x66, 0x75, 0x6c, 0x79, 0x20, 0x69, 0x6d, 0x70, + 0x6f, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, + 0x67, 0x28, 0x27, 0x50, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x69, 0x6e, + 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, + 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, + 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, @@ -608,515 +626,639 @@ unsigned char index_html[] = { 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, + 
0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, + 0x65, 0x64, 0x3a, 0x20, 0x27, 0x27, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x2e, 0x64, 0x61, 0x74, + 0x61, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x20, 0x3d, 0x20, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, - 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, - 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, - 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, - 0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x70, - 0x70, 0x6c, 0x79, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, - 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, + 0x3a, 0x20, 0x5b, 0x5d, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, 0x64, 0x41, - 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, - 0x64, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x67, 0x65, 0x74, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, - 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75, 0x73, - 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x61, - 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, - 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, - 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x27, 0x29, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6c, 0x61, 0x73, + 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, + 0x6c, 0x79, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, + 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, + 0x65, 
0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, + 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, + 0x76, 0x65, 0x64, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x67, 0x65, 0x74, 0x20, 0x61, 0x75, 0x74, + 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, + 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, + 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, + 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, + 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, + 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x27, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6c, + 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, + 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, + 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, + 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x73, 0x74, 0x6f, + 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, + 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, - 0x67, 0x28, 0x27, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, - 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f, - 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x69, - 0x6e, 0x67, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x55, - 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, - 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x4e, 0x6f, 0x20, 0x61, 0x75, - 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, - 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, - 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x6e, 
0x6f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, - 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x77, 0x61, 0x73, 0x20, - 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x73, 0x6f, 0x20, 0x6c, 0x6f, - 0x61, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x64, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x41, - 0x70, 0x70, 0x6c, 0x79, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x20, - 0x64, 0x61, 0x74, 0x61, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x73, 0x65, 0x6c, - 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, - 0x67, 0x28, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x73, 0x65, - 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, - 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x41, 0x75, 0x74, - 0x6f, 0x73, 0x61, 0x76, 0x65, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, - 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x77, 0x65, 0x20, 0x64, - 0x6f, 0x6e, 0x27, 0x74, 0x20, 0x77, 0x61, 0x6e, 0x74, 0x20, 0x74, 0x6f, - 0x20, 0x73, 0x61, 0x76, 0x65, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, - 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x2c, 0x20, 0x73, 0x6f, 0x20, 0x6c, 0x65, 0x74, 0x27, - 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x20, 0x61, 0x20, 0x6e, - 0x65, 0x77, 0x20, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x65, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x4e, 0x6f, 0x20, + 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, + 0x2c, 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, + 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, + 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x77, 0x61, + 0x73, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x73, 0x6f, 0x20, + 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x64, 0x65, + 0x66, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, + 0x27, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x61, + 0x6c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x73, + 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, + 0x6c, 0x6f, 0x67, 0x28, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, + 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, + 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, + 0x76, 0x65, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, + 0x28, 0x27, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x41, + 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x2e, 0x2e, 0x2e, 0x27, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, + 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 
0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, 0x64, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x77, 0x65, + 0x20, 0x64, 0x6f, 0x6e, 0x27, 0x74, 0x20, 0x77, 0x61, 0x6e, 0x74, 0x20, + 0x74, 0x6f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x20, 0x6f, 0x76, 0x65, 0x72, + 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x73, 0x6f, 0x20, 0x6c, 0x65, + 0x74, 0x27, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x20, 0x61, + 0x20, 0x6e, 0x65, 0x77, 0x20, 0x6f, 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x65, 0x77, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, + 0x20, 0x3d, 0x20, 0x27, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x2d, 0x27, 0x20, 0x2b, 0x20, 0x44, 0x61, 0x74, + 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, - 0x20, 0x27, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x2d, 0x27, 0x20, 0x2b, 0x20, 0x44, 0x61, 0x74, 0x65, 0x2e, - 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x27, 0x6e, 0x61, - 0x6d, 0x65, 0x27, 0x3a, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x27, 0x64, - 0x61, 0x74, 0x61, 0x27, 0x3a, 0x20, 0x7b, 0x20, 0x27, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x27, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x27, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x53, - 0x61, 0x76, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x20, 0x27, 0x20, 0x2b, - 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x20, 0x69, - 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, - 0x76, 0x65, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, - 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, - 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x27, 0x2c, 0x20, 0x6e, - 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x61, 0x6e, 0x64, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x69, 0x74, 0x20, - 0x62, 0x61, 0x63, 0x6b, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x70, 0x70, - 0x6c, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, - 0x73, 
[Regenerated hunks of the embedded `unsigned char index_html[]` array, the server web UI that is compiled into the binary. Decoded, the updated front-end script: autosaves the active prompt template and parameters to the `user_templates_last` localStorage slot and immediately reloads it via `userTemplateLoadAndApplyAutosaved()`; records the serving slot from each streamed chunk (`slot_id = data.slot_id`) and passes `slot_id` back with both `chat()` and `runCompletion()` requests; adds an `uploadImage` handler that reads the chosen file with `FileReader.readAsDataURL()`, previews it in a new `<img>` element, and attaches the base64 payload to `params.value.image_data` as `{ data, id: 10 }`; switches to a LLaVA-style prompt ("A chat between a curious human and an artificial intelligence assistant. ... USER:[img-10]${msg} ASSISTANT:") when an image is selected; shows a new "Upload Image" button next to "Send"; and alerts "The server was not compiled for multimodal or the model projector can't be loaded." when the backend lacks multimodal support.]
0x28, 0x29, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, + 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x65, 0x72, 0x74, + 0x28, 0x60, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x20, 0x66, 0x61, + 0x69, 0x6c, 0x65, 0x64, 0x3a, 0x20, 0x24, 0x7b, 0x65, 0x2e, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, + 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, - 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, - 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, - 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, - 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, - 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, + 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, + 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, + 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, + 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, + 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, + 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 
0x65, 0x70, 0x3d, 0x22, 0x24, + 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, - 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, - 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x65, 0x2e, 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, - 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, - 0x6c, 0x79, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, - 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x49, + 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, + 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, + 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, - 0x6c, 0x65, 0x64, 0x3e, 0x55, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, - 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 
0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, - 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x52, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, - 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, - 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x75, 0x74, 0x6f, - 0x73, 0x61, 0x76, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x20, 0x6f, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x72, 0x79, 0x20, 0x63, - 0x68, 0x61, 0x6e, 0x67, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x28, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, - 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x20, 0x3d, 0x20, 0x28, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, - 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x47, - 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, - 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, - 0x6d, 0x61, 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, - 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x55, 0x73, 0x65, 0x20, 0x67, 0x62, - 0x6e, 0x66, 0x20, 0x6f, 0x72, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2b, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, - 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x72, - 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, - 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, - 0x70, 0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 
0x22, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x31, 0x2c, 0x70, - 0x72, 0x6f, 0x70, 0x32, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x33, 0x22, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, - 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, - 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, - 0x22, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x6f, 0x6e, 0x63, - 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x76, 0x65, - 0x72, 0x74, 0x4a, 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x3e, 0x43, 0x6f, 0x6e, - 0x76, 0x65, 0x72, 0x74, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, - 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x46, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, + 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, - 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, - 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, - 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, - 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x29, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, - 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, - 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, - 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, - 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x75, - 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, - 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72, 0x7d, - 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62, 0x6f, - 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, - 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, - 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x63, - 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, - 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, - 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, - 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, - 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, - 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, - 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, 0x68, 0x69, 0x73, - 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, - 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, - 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, - 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x31, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x47, - 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, - 0x6c, 0x28, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, - 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, - 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70, 
0x74, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, - 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x3e, 0x24, 0x7b, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x28, 0x29, 0x7d, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x24, 0x7b, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, 0x22, 0x3e, 0x3c, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, - 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, - 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, - 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, - 0x64, 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x7d, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, - 0x2f, 0x3e, 0x20, 0x43, 0x68, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, - 0x22, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, - 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, - 0x6e, 0x22, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x43, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 
0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, - 0x74, 0x27, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, - 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x20, 0x3a, 0x20, - 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, - 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, - 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, - 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, - 0x64, 0x69, 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, - 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, - 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x31, 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, - 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, - 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, - 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, - 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, - 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, - 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, - 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 
0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, - 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, - 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, - 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, - 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, - 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, - 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, - 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, - 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, - 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, - 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, - 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, - 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, - 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, - 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, - 0x3e, 0x4d, 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, - 0x73, 0x3c, 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x46, 0x53, 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, - 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, - 0x73, 0x5f, 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, - 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 
0x75, 0x65, 0x3a, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, - 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, - 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, - 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, - 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, - 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, - 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, - 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, - 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, - 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, - 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, - 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, - 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, - 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, - 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, - 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, - 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, + 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, + 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, + 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, + 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, - 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, - 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, - 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, - 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 
0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, - 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, - 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, - 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, - 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x5f, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, - 0x65, 0x74, 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, - 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, - 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x53, 0x68, 0x6f, 0x77, 0x20, 0x50, 0x72, 0x6f, - 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2c, 0x20, 0x6d, 0x69, - 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, - 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x22, 0x2c, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, - 0x62, 0x73, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, - 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x62, - 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x20, 0x3d, 0x20, 0x4d, 
0x61, 0x74, - 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, - 0x2a, 0x20, 0x28, 0x31, 0x20, 0x2d, 0x20, 0x70, 0x29, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x67, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, - 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, 0x70, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x60, 0x72, 0x67, 0x62, 0x61, 0x28, 0x24, 0x7b, 0x72, 0x7d, - 0x2c, 0x24, 0x7b, 0x67, 0x7d, 0x2c, 0x30, 0x2c, 0x30, 0x2e, 0x33, 0x29, - 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x62, - 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x3d, 0x20, - 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7d, 0x20, 0x3d, - 0x20, 0x6d, 0x73, 0x67, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x21, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, - 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7c, 0x7c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x30, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, - 0x3e, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x4e, 0x6f, 0x74, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x62, 0x79, 0x74, 0x65, 0x20, 0x70, 0x61, 0x69, - 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x2e, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x74, 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, - 0x74, 0x68, 0x28, 0x27, 0x62, 0x79, 0x74, 0x65, 0x3a, 0x20, 0x5c, 0x5c, - 0x27, 0x29, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, - 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, - 0x61, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 
0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x70, 0x72, 0x6f, - 0x62, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x2e, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, - 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x3a, 0x20, 0x5b, 0x70, 0x72, - 0x6f, 0x62, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, + 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, + 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, + 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x76, + 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, + 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, + 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x55, 0x73, 0x65, 0x72, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, + 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x29, + 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, + 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, + 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, - 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, - 0x74, 0x61, 0x3d, 0x24, 0x7b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, - 0x74, 0x61, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x70, - 0x72, 0x6f, 0x62, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, - 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x0a, 
0x20, 0x20, + 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, + 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3e, 0x55, 0x73, + 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x62, 0x75, + 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, + 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x7d, + 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x74, + 0x6f, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3c, 0x2f, 0x62, + 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, + 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x20, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x6f, 0x6e, 0x20, 0x65, + 0x76, 0x65, 0x72, 0x79, 0x20, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, + 0x73, 0x61, 0x76, 0x65, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x62, - 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x28, 0x70, 0x20, 0x3d, 0x3e, 0x20, - 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, - 0x3d, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3f, 0x20, 0x70, 0x72, - 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x66, 0x6f, 0x75, 0x6e, - 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x68, 0x74, 0x6d, + 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 
0x3d, - 0x22, 0x70, 0x72, 0x6f, 0x62, 0x2d, 0x73, 0x65, 0x74, 0x22, 0x3e, 0x0a, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x3e, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, + 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, + 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x67, + 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, + 0x55, 0x73, 0x65, 0x20, 0x67, 0x62, 0x6e, 0x66, 0x20, 0x6f, 0x72, 0x20, + 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2b, + 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, + 0x61, 0x72, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x7d, 0x2f, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x70, 0x2d, 0x6f, 0x72, 0x64, 0x65, + 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, + 0x65, 0x72, 0x3d, 0x22, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x70, + 0x72, 0x6f, 0x70, 0x31, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x32, 0x2c, 0x70, + 0x72, 0x6f, 0x70, 0x33, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x47, 0x72, + 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, + 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, + 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, + 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x62, 0x75, 0x74, 0x74, 0x6f, + 0x6e, 0x22, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, + 0x7b, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x4a, 0x53, 0x4f, 0x4e, + 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, + 0x72, 0x7d, 0x3e, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x20, 0x4a, + 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x3c, 0x2f, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 
0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x46, 0x6f, 0x72, 0x3d, 0x22, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, + 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, + 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, + 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, + 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, + 0x73, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, + 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, + 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, + 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, + 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, - 0x28, 0x70, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, + 0x6f, 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, + 0x61, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, + 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, + 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, + 0x61, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, + 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 
0x70, + 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, + 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, + 0x77, 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, + 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x28, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, + 0x6e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, + 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, + 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x28, 0x29, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x24, 0x7b, 0x47, + 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, + 0x6c, 0x28, 0x29, 0x7d, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, + 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, + 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x55, 0x73, + 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, + 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x73, + 0x6c, 0x69, 0x6d, 0x22, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x74, + 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 
+ 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x22, + 0x63, 0x68, 0x61, 0x74, 0x22, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x43, + 0x68, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x63, 0x6c, 0x61, + 0x73, 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, 0x22, 0x3e, 0x3c, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, + 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x74, 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, + 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, + 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x22, 0x63, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x7d, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, + 0x20, 0x2f, 0x3e, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, + 0x6f, 0x6e, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6b, 0x65, - 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x0a, 0x20, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, + 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, 0x74, 0x27, 0x20, 0x3f, + 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, + 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, + 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, + 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, + 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, + 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, + 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, + 0x74, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, + 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x31, 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x65, + 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, + 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, + 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3d, 0x24, - 0x7b, 0x60, 0x70, 0x72, 0x6f, 0x62, 0x3a, 0x20, 0x24, 0x7b, 0x70, 0x2e, - 0x70, 0x72, 0x6f, 0x62, 0x7d, 0x60, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, + 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, + 0x61, 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, + 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, + 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, + 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, + 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, + 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, + 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, + 0x65, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, + 0x6c, 0x69, 0x7a, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, + 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, + 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x20, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, + 0x54, 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, + 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, + 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, + 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 
0x6f, 0x70, 0x5f, 0x6b, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, + 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, + 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, + 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, + 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x22, 0x2c, 0x20, + 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, + 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, + 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, 0x6f, 0x72, + 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, 0x73, + 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, + 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, + 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, + 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, 0x2d, + 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, 0x7a, + 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x66, 0x73, 0x5f, 0x7a, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, + 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, + 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x79, + 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, 0x61, + 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, + 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, 0x2c, + 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, + 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, + 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, + 0x61, 
0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, + 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, + 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, + 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, + 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, + 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, + 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, + 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, + 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, + 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, + 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 
0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, + 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, + 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, + 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, + 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, + 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, + 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, + 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, + 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, + 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, + 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x20, + 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x22, + 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x20, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 
0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, + 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, + 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, + 0x22, 0x53, 0x68, 0x6f, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, + 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, + 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, + 0x70, 0x72, 0x6f, 0x62, 0x73, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x20, + 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, + 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, + 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x72, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, + 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, + 0x28, 0x31, 0x20, 0x2d, 0x20, 0x70, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x20, + 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, + 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, 0x70, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x60, 0x72, 0x67, 0x62, 0x61, 0x28, 0x24, 0x7b, 0x72, 0x7d, 0x2c, 0x24, + 0x7b, 0x67, 0x7d, 0x2c, 0x30, 0x2c, 0x30, 0x2e, 0x33, 0x29, 0x60, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, + 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x64, 0x61, 0x74, 0x61, + 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, + 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x6d, + 0x73, 0x67, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x21, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, + 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, + 0x74, 0x69, 0x65, 0x73, 0x20, 0x7c, 0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, + 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, + 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, + 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, + 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, + 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x4e, 0x6f, 0x74, 0x20, 0x66, 0x6f, + 0x72, 0x20, 0x62, 0x79, 0x74, 0x65, 0x20, 0x70, 0x61, 0x69, 0x72, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, + 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, + 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x2e, 0x73, 0x74, 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, 0x74, 0x68, + 0x28, 0x27, 0x62, 0x79, 0x74, 0x65, 0x3a, 0x20, 0x5c, 0x5c, 0x27, 0x29, + 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, + 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, 0x61, 0x20, + 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, + 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, + 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x62, 0x20, + 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x2e, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, + 0x69, 0x74, 0x69, 0x65, 0x73, 0x3a, 0x20, 0x5b, 0x70, 0x72, 0x6f, 0x62, + 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, + 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, + 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, 0x74, 0x61, + 0x3d, 0x24, 0x7b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, 0x61, + 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x70, 0x72, 0x6f, + 0x62, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x7d, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, + 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, + 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6f, + 0x75, 0x6e, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, + 0x66, 
0x69, 0x6e, 0x64, 0x28, 0x70, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x2e, + 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, + 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, + 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, + 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2e, + 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, + 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, + 0x64, 0x72, 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, + 0x72, 0x6f, 0x62, 0x2d, 0x73, 0x65, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x28, 0x70, + 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, + 0x64, 0x65, 0x78, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x69, 0x74, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x60, 0x70, 0x72, 0x6f, 0x62, + 0x3a, 0x20, 0x24, 0x7b, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x7d, 0x60, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, + 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x27, 0x30, 0x2e, 0x33, 0x65, 0x6d, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, - 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, - 0x72, 0x3a, 0x20, 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, - 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, - 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, + 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x2e, 0x74, + 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, + 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, + 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, + 0x61, 0x72, 
0x65, 0x6e, 0x74, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, @@ -2056,464 +2210,470 @@ unsigned char index_html[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, - 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x20, 0x73, 0x74, 0x79, 0x6c, - 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, - 0x6f, 0x75, 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, - 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x7d, 0x7d, 0x20, 0x70, 0x6f, 0x70, - 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, - 0x3d, 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, - 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x6d, - 0x61, 0x74, 0x63, 0x68, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, - 0x29, 0x20, 0x3f, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x62, 0x72, - 0x20, 0x2f, 0x3e, 0x60, 0x20, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, - 0x20, 0x6d, 0x61, 0x6e, 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, - 0x77, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, - 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, - 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x26, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, - 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x70, 0x6f, 0x76, + 0x65, 0x72, 0x7d, 0x20, 0x73, 0x74, 0x79, 
0x6c, 0x65, 0x3d, 0x24, 0x7b, + 0x7b, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, + 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, + 0x72, 0x20, 0x7d, 0x7d, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, + 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x24, 0x7b, 0x70, + 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, + 0x65, 0x6e, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x2e, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, + 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x29, 0x20, 0x3f, 0x20, + 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x60, + 0x20, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, + 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, + 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x3c, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, - 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, - 0x27, 0x26, 0x67, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, - 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, - 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, - 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, - 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, - 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, - 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, - 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, - 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, - 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, + 0x26, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x2e, 0x72, - 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, - 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, + 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, + 0x65, 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, + 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, + 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, + 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, + 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, + 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, + 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, + 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, + 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, + 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, + 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, - 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, - 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, - 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, - 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, - 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, - 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, - 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, - 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, - 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, - 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, - 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, - 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, - 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, - 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, - 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, - 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, - 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, - 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, - 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, - 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, - 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, - 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, - 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, - 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, - 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x70, 0x6f, - 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x6f, 0x70, - 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, 0x4f, 0x70, - 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, - 0x61, 0x6c, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, - 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, - 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x74, 0x6f, - 0x70, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, 0x27, 0x2c, 0x20, 0x6c, 0x65, - 0x66, 0x74, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, 0x27, 0x20, 0x7d, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x20, - 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, - 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x6f, 0x67, 0x67, - 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, - 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x62, 0x75, 0x74, 0x74, - 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 
0x72, 0x65, 0x6e, - 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x63, - 0x74, 0x20, 0x3d, 0x20, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, - 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, - 0x74, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x69, 0x6e, 0x67, 0x43, 0x6c, 0x69, - 0x65, 0x6e, 0x74, 0x52, 0x65, 0x63, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x73, - 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x60, 0x24, 0x7b, - 0x72, 0x65, 0x63, 0x74, 0x2e, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, - 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, 0x72, - 0x6f, 0x6c, 0x6c, 0x59, 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, - 0x66, 0x74, 0x3a, 0x20, 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, 0x2e, - 0x6c, 0x65, 0x66, 0x74, 0x20, 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, - 0x77, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x58, 0x7d, 0x70, 0x78, - 0x60, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x73, - 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, - 0x20, 0x21, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, - 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x20, 0x3d, 0x20, 0x28, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, - 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x21, 0x70, - 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x29, 0x20, 0x26, 0x26, 0x20, 0x21, 0x62, 0x75, 0x74, - 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, - 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, - 0x74, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, - 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, - 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 0x2c, 0x20, 0x68, 
0x61, 0x6e, 0x64, - 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, - 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x72, - 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, - 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, - 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, - 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, - 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, - 0x20, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, - 0x70, 0x61, 0x6e, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x7d, - 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x62, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x52, 0x65, 0x66, 0x7d, 0x20, 0x6f, 0x6e, 0x43, 0x6c, 0x69, 0x63, - 0x6b, 0x3d, 0x24, 0x7b, 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, - 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x3e, 0x24, 0x7b, 0x70, 0x72, 0x6f, - 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, - 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x26, 0x26, 0x20, 0x68, 0x74, + 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, + 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, + 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, + 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, + 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, + 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, + 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, + 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, + 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, + 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, + 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, + 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, + 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, + 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, + 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, + 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, + 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 
0x65, 0x72, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, + 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, - 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x3d, 0x22, 0x23, 0x70, 0x6f, 0x72, 0x74, - 0x61, 0x6c, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, + 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, + 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, + 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, + 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, + 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, + 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, + 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, + 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, + 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, + 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, + 0x72, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, + 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x3d, + 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x66, + 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, + 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x27, + 0x30, 0x70, 0x78, 0x27, 0x2c, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, + 0x27, 0x30, 0x70, 0x78, 0x27, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x62, 0x75, + 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, + 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 
0x66, 0x20, 0x3d, + 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, + 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, + 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, + 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x63, 0x74, 0x20, 0x3d, 0x20, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x42, 0x6f, 0x75, + 0x6e, 0x64, 0x69, 0x6e, 0x67, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x52, + 0x65, 0x63, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, - 0x72, 0x52, 0x65, 0x66, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x3d, 0x22, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, - 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x2c, + 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, + 0x2e, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x2b, 0x20, 0x77, 0x69, + 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x59, + 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, + 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, 0x2e, 0x6c, 0x65, 0x66, 0x74, + 0x20, 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, + 0x72, 0x6f, 0x6c, 0x6c, 0x58, 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x21, 0x69, 0x73, + 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, 0x61, + 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, + 0x73, 0x69, 0x64, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, + 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x6f, 0x70, 0x6f, + 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, + 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x21, 0x70, 0x6f, 0x70, 0x6f, 0x76, + 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 
0x72, 0x65, 0x6e, + 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, + 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, + 0x20, 0x26, 0x26, 0x20, 0x21, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, + 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, + 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, + 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x61, 0x64, + 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, + 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, + 0x6e, 0x27, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, + 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, + 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, + 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, + 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, + 0x6e, 0x27, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, + 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x5d, 0x29, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, + 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x7d, 0x20, 0x72, 0x65, 0x66, + 0x3d, 0x24, 0x7b, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, + 0x7d, 0x20, 0x6f, 0x6e, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, + 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, + 0x72, 0x7d, 0x3e, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, + 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x3c, 0x2f, 0x73, 0x70, + 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x24, 0x7b, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x20, 0x26, 0x26, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, + 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, 0x20, 0x69, 0x6e, 0x74, + 0x6f, 0x3d, 0x22, 0x23, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x3d, + 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 
0x52, 0x65, 0x66, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, + 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, + 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x70, 0x6f, 0x73, + 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x74, 0x6f, 0x70, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6c, 0x65, 0x66, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, - 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53, - 0x6f, 0x75, 0x72, 0x63, 0x65, 0x3a, 0x20, 0x70, 0x72, 0x65, 0x61, 0x63, - 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x20, 0x28, 0x68, 0x74, - 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x65, 0x76, 0x65, 0x6c, 0x6f, 0x70, - 0x69, 0x74, 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, - 0x72, 0x74, 0x61, 0x6c, 0x2f, 0x62, 0x6c, 0x6f, 0x62, 0x2f, 0x6d, 0x61, - 0x73, 0x74, 0x65, 0x72, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x70, 0x72, 0x65, - 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x2e, 0x6a, - 0x73, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x2a, 0x20, 0x52, - 0x65, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, - 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x65, 0x73, - 0x63, 0x65, 0x6e, 0x64, 0x61, 0x6e, 0x74, 0x73, 0x20, 0x69, 0x6e, 0x74, - 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x69, 0x76, 0x65, 0x6e, 0x20, - 0x43, 0x53, 0x53, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, - 0x20, 0x2a, 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x20, 0x65, 0x78, 0x74, - 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x55, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, - 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 
0x20, 0x28, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x20, 0x21, 0x3d, 0x3d, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, - 0x69, 0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, - 0x61, 0x79, 0x65, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, - 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x4d, 0x6f, 0x75, 0x6e, - 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, - 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, - 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, - 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x2e, 0x62, 0x69, 0x6e, - 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x29, 0x3b, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x70, 0x6f, 0x70, + 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, + 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x60, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53, 0x6f, 0x75, 0x72, 0x63, + 0x65, 0x3a, 0x20, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, + 0x72, 0x74, 0x61, 0x6c, 0x20, 0x28, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, + 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, + 0x2f, 0x64, 0x65, 0x76, 0x65, 0x6c, 0x6f, 0x70, 0x69, 0x74, 0x2f, 0x70, + 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, + 0x2f, 0x62, 0x6c, 0x6f, 0x62, 0x2f, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, + 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, + 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x2e, 0x6a, 0x73, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x2a, 0x20, 0x52, 0x65, 0x64, 0x69, 0x72, + 0x65, 0x63, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, + 0x67, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x65, 0x73, 0x63, 0x65, 0x6e, 0x64, + 0x61, 0x6e, 0x74, 0x73, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x67, 0x69, 0x76, 0x65, 0x6e, 0x20, 0x43, 0x53, 0x53, 0x20, + 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x20, 0x2a, 0x2f, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 
0x50, 0x6f, + 0x72, 0x74, 0x61, 0x6c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, + 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, + 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, + 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, + 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x5b, 0x69, 0x5d, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x73, 0x65, 0x74, + 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, - 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, - 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, + 0x74, 0x44, 0x69, 0x64, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, - 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x26, 0x26, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, - 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x29, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, - 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x2e, - 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, 0x64, - 0x65, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x6f, 0x64, 0x65, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x27, 0x20, 0x3f, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, - 0x2e, 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, - 0x6f, 0x72, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, 0x20, 0x3a, 0x20, 0x6e, - 0x6f, 0x64, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, - 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x73, 0x68, 0x6f, 
0x77, - 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, - 0x65, 0x64, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x63, 0x6c, 0x65, 0x61, 0x6e, 0x20, 0x75, 0x70, 0x20, 0x6f, 0x6c, 0x64, - 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x69, 0x66, 0x20, 0x6d, 0x6f, 0x76, - 0x69, 0x6e, 0x67, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73, 0x3a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, - 0x74, 0x6f, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, - 0x69, 0x6e, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, - 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, - 0x6f, 0x74, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x6e, - 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, - 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, - 0x20, 0x2f, 0x3e, 0x60, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, - 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, - 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, - 0x74, 0x6f, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x69, - 0x6e, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, - 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x7d, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x73, 0x68, 0x6f, 0x77, 0x20, 0x26, 0x26, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, - 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, - 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, - 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x2c, 0x20, 0x74, 0x68, 0x69, 
0x73, - 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, + 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x3d, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, + 0x61, 0x79, 0x65, 0x72, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, + 0x69, 0x73, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, + 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x68, 0x69, 0x67, 0x68, 0x2d, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, - 0x72, 0x73, 0x20, 0x69, 0x74, 0x73, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, - 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x20, 0x69, 0x66, 0x20, 0x69, 0x74, - 0x20, 0x65, 0x78, 0x69, 0x73, 0x74, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, - 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, - 0x6c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, - 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, - 0x72, 0x6f, 0x78, 0x79, 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, - 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x74, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x28, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x7b, - 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7d, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, - 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, - 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x6d, 0x6f, 0x64, - 0x65, 0x2d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 
0x22, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x31, - 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, - 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, + 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, + 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, + 0x72, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, + 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x66, + 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, + 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, + 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x29, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, + 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x2e, 0x72, 0x65, 0x6d, 0x6f, + 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x6e, 0x6f, + 0x64, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x3d, 0x3d, 0x3d, + 0x20, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x20, 0x3f, 0x20, + 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, + 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x6e, + 0x6f, 0x64, 0x65, 0x29, 0x20, 0x3a, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, + 0x79, 0x65, 0x72, 0x28, 0x73, 0x68, 0x6f, 0x77, 0x20, 0x3d, 0x20, 0x74, + 0x72, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x29, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x6c, 0x65, 0x61, + 0x6e, 0x20, 0x75, 0x70, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x6e, 0x6f, 0x64, + 0x65, 0x20, 0x69, 0x66, 0x20, 0x6d, 0x6f, 0x76, 0x69, 0x6e, 0x67, 0x20, + 0x62, 0x61, 0x73, 0x65, 0x73, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x21, + 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, + 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74, 
0x65, + 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, + 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x26, 0x26, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, + 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, + 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, + 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x2f, 0x3e, 0x60, + 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x3d, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, + 0x64, 0x65, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, + 0x6f, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, + 0x28, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, + 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, + 0x73, 0x68, 0x6f, 0x77, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, + 0x72, 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, - 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, - 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, - 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, - 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, + 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, + 0x78, 0x79, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x60, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, + 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, + 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, + 0x65, 0x72, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, + 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x20, 0x68, 0x69, 0x67, 0x68, 0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x68, + 0x61, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x73, 0x20, 0x69, + 0x74, 0x73, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, + 0x6c, 0x64, 0x20, 0x69, 0x66, 0x20, 0x69, 0x74, 0x20, 0x65, 0x78, 0x69, + 0x73, 0x74, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x75, 0x73, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, + 0x6e, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x20, 0x72, 0x65, + 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x78, + 0x79, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, + 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, + 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, + 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, + 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x7b, 0x20, 0x63, 0x68, 0x69, + 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c, + 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, + 0x61, 0x73, 0x73, 0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, - 0x63, 0x68, 0x61, 0x74, 0x27, 0x20, 0x3f, 0x20, 0x4d, 0x65, 0x73, 0x73, - 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x3a, 0x20, 0x43, - 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, - 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, - 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 
0x6e, - 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, - 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, - 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, - 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, - 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, - 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, - 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, - 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, - 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, - 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, - 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, - 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, - 0x72, 0x28, 0x27, 0x23, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, - 0x72, 0x27, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, - 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x20, 0x20, - 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, - 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x3c, 0x2f, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, - 0x3d, 0x22, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x3c, 0x2f, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, - 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a + 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x22, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, + 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, + 0x61, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, + 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, + 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, + 0x74, 
0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, + 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, 0x74, + 0x27, 0x20, 0x3f, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, + 0x6e, 0x70, 0x75, 0x74, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, + 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x73, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, + 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, + 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, + 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x3c, + 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, + 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, + 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e, 0x6f, 0x76, + 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x22, 0x3e, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x61, + 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, + 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, + 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67, 0x6d, 0x6c, + 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, 0x2f, 0x70, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, + 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, 0x2c, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, 0x72, + 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x27, 0x23, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x27, 0x29, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, + 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x3e, 0x0a, 0x0a, 0x3c, + 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x66, 0x69, + 0x6c, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x66, 0x69, 0x6c, 0x65, + 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0x20, 0x61, 0x63, 0x63, 0x65, 0x70, + 0x74, 0x3d, 0x22, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x2f, 0x2a, 0x22, 0x20, + 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x22, 0x64, 0x69, 0x73, 0x70, 0x6c, + 0x61, 0x79, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x22, 0x3e, 0x0a, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x3c, + 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x70, 0x6f, 0x72, 0x74, + 0x61, 
0x6c, 0x22, 0x3e, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x3c, + 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, + 0x6d, 0x6c, 0x3e, 0x0a, 0x0a }; -unsigned int index_html_len = 30190; +unsigned int index_html_len = 32105; diff --git a/examples/server/public/index.html b/examples/server/public/index.html index f539884e2..39d7bb93d 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -125,6 +125,7 @@ background-color: #222; color: #ddd; } + code { font-family: monospace; padding: 0.1em 0.3em; @@ -141,7 +142,8 @@ display: inline; } - header, footer { + header, + footer { text-align: center; } @@ -163,6 +165,7 @@ 0% { background-position: 0%; } + 100% { background-position: 100%; } @@ -181,6 +184,7 @@ --loading-color-1: #22222200; --loading-color-2: #222222ff; } + .popover-content { background-color: black; } @@ -194,6 +198,8 @@ import { llama } from '/completion.js'; import { SchemaConverter } from '/json-schema-to-grammar.mjs'; + let selected_image = false; + var slot_id = -1; const session = signal({ prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.", @@ -203,6 +209,7 @@ type: "chat", // "chat" | "completion" char: "Llama", user: "User", + image_selected: '' }) const params = signal({ @@ -220,7 +227,9 @@ mirostat_tau: 5, // target entropy mirostat_eta: 0.1, // learning rate grammar: '', - n_probs: 0, // no completion_probabilities + n_probs: 0, // no completion_probabilities, + image_data: [], + cache_prompt: true }) /* START: Support for storing prompt templates and parameters in borwser LocalStorage */ @@ -270,6 +279,7 @@ // saved templates were successfuly imported. 
console.log('Processing saved templates and updating default template') + params.value = { ...params.value, image_data: [] }; //console.log(importedTemplates); savedUserTemplates.value = importedTemplates; @@ -294,7 +304,9 @@ function userTemplateApply(t) { session.value = t.data.session; + session.value = { ...session.value, image_selected: '' }; params.value = t.data.params; + params.value = { ...params.value, image_data: [] }; } function userTemplateResetToDefaultAndApply() { @@ -385,20 +397,25 @@ throw new Error("already running"); } controller.value = new AbortController(); - for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) { + for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { const data = chunk.data; if (data.stop) { while ( currentMessages.length > 0 && currentMessages[currentMessages.length - 1].content.match(/\n$/) != null - ) { + ) { currentMessages.pop(); } transcriptUpdate([...history, [char, currentMessages]]) console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data); } else { currentMessages.push(data); + slot_id = data.slot_id; + if (selected_image && !data.multimodal) { + alert("The server was not compiled for multimodal or the model projector can't be loaded."); + return; + } transcriptUpdate([...history, [char, currentMessages]]) } @@ -419,7 +436,7 @@ transcriptUpdate([...session.value.transcript, ["{{user}}", msg]]) - const prompt = template(session.value.template, { + let prompt = template(session.value.template, { message: msg, history: session.value.transcript.flatMap( ([name, data]) => @@ -434,9 +451,12 @@ ) ).join("\n"), }); - + if (selected_image) { + prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`; + } await runLlama(prompt, { ...params.value, + slot_id: slot_id, stop: ["", template("{{char}}:"), template("{{user}}:")], }, "{{char}}"); } @@ -446,10 +466,11 @@ console.log('already running...'); return; } - const {prompt} = session.value; + const { prompt } = session.value; transcriptUpdate([...session.value.transcript, ["", prompt]]); await runLlama(prompt, { ...params.value, + slot_id: slot_id, stop: [], }, ""); } @@ -467,6 +488,27 @@ transcriptUpdate([]); } + const uploadImage = (e) => { + e.preventDefault(); + document.getElementById("fileInput").click(); + document.getElementById("fileInput").addEventListener("change", function (event) { + const selectedFile = event.target.files[0]; + if (selectedFile) { + const reader = new FileReader(); + reader.onload = function () { + const image_data = reader.result; + session.value = { ...session.value, image_selected: image_data }; + params.value = { + ...params.value, image_data: [ + { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }] + } + }; + selected_image = true; + reader.readAsDataURL(selectedFile); + } + }); + } + function MessageInput() { const message = useSignal("") @@ -497,6 +539,7 @@
+
@@ -540,7 +583,7 @@ data; message = html`<${Markdownish} text=${template(text)} />` } - if(user) { + if (user) { return html`

${template(user)}: ${message}

` } else { return html`

${message}

` @@ -549,6 +592,7 @@ return html`
+ ${messages.flatMap(chatLine)}
`; }; @@ -567,7 +611,7 @@ const converter = new SchemaConverter( grammarJsonSchemaPropOrder.value .split(',') - .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {}) + .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}) ) converter.visit(schema, '') params.value = { @@ -579,7 +623,7 @@ } } - const FloatField = ({label, max, min, name, step, value}) => { + const FloatField = ({ label, max, min, name, step, value }) => { return html`
@@ -589,7 +633,7 @@ ` }; - const IntField = ({label, max, min, name, value}) => { + const IntField = ({ label, max, min, name, value }) => { return html`
@@ -672,7 +716,7 @@ ${GrammarControl()} ` - ); + ); const CompletionConfigForm = () => ( html` @@ -694,20 +738,20 @@ ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
- ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})} - ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})} - ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})} - ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})} - ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})} - ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})} + ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })} + ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} + ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} + ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} + ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} + ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
More options
- ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})} - ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})} - ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})} - ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})} + ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} + ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} + ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}

@@ -716,11 +760,11 @@
- ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})} - ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})} + ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })} + ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
- ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})} + ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
@@ -759,20 +803,20 @@ const popoverChildren = html`
          ${probs.map((p, index) => {
-           return html`
+             return html`
                ${p.tok_str}: ${Math.floor(p.prob * 100)}%
              `
-         })}
+           })}
` @@ -851,9 +895,9 @@ ref=${popoverRef} class="popover-content" style=${{ - top: position.value.top, - left: position.value.left, - }} + top: position.value.top, + left: position.value.left, + }} > ${props.popoverChildren}
@@ -952,8 +996,11 @@
-
+
+
+
+ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ee0ababb1..fd755327a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,8 +1,11 @@ #include "common.h" #include "llama.h" -#include "build-info.h" #include "grammar-parser.h" +#include "../llava/clip.h" + +#include "stb_image.h" + #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 @@ -18,12 +21,14 @@ #include "json-schema-to-grammar.mjs.hpp" #include +#include +#include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif -using namespace httplib; using json = nlohmann::json; struct server_params @@ -35,6 +40,166 @@ struct server_params int32_t write_timeout = 600; }; +static bool server_verbose = false; + +#if SERVER_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (server_verbose) \ + { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +// +// base64 utils (TODO: move to common in the future) +// + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static inline bool is_base64(uint8_t c) +{ + return (isalnum(c) || (c == '+') || (c == '/')); +} + +static std::vector base64_decode(std::string const &encoded_string) +{ + int i = 0; + int j = 0; + int in_ = 0; + + int in_len = encoded_string.size(); + + uint8_t char_array_4[4]; + uint8_t char_array_3[3]; + + std::vector ret; + + while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) + { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i == 4) + { + for (i = 0; i <4; i++) + { + char_array_4[i] = base64_chars.find(char_array_4[i]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + { + ret.push_back(char_array_3[i]); + } + i = 0; + } + } + + if (i) + { + for (j = i; j <4; j++) + { + char_array_4[j] = 0; + } + + for (j = 0; j <4; j++) + { + char_array_4[j] = base64_chars.find(char_array_4[j]); + } + + char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (j = 0; (j < i - 1); j++) + { + ret.push_back(char_array_3[j]); + } + } + + return ret; +} + +// +// parallel +// + +enum task_type { + COMPLETION_TASK, + CANCEL_TASK +}; + +struct task_server { + int id; + int target_id; + task_type type; + json data; + bool infill_mode = false; + bool embedding_mode = false; +}; + +struct task_result { + int id; + bool stop; + bool error; + json result_json; +}; + +// TODO: can become bool if we can't find use of more states +enum slot_state +{ + IDLE, + PROCESSING, +}; + +enum slot_command +{ + NONE, + LOAD_PROMPT, + RELEASE, +}; + +struct slot_params +{ + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt + + uint32_t seed = -1; // RNG seed + int32_t 
n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict + + std::vector antiprompt; + + json input_prefix; + json input_suffix; +}; + +struct slot_image +{ + int32_t id; + + bool request_encode_image = false; + float* image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 img_data; + + std::string prefix_prompt; // before of this image +}; + // completion token output with probabilities struct completion_token_output { @@ -46,6 +211,7 @@ struct completion_token_output std::vector probs; llama_token tok; + std::string text_to_send; }; static size_t common_part(const std::vector &a, const std::vector &b) @@ -90,6 +256,7 @@ static size_t find_partial_stop_string(const std::string &stop, return std::string::npos; } +// TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) { @@ -104,12 +271,13 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) static void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log{ + nlohmann::ordered_json log + { {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, + {"level", level}, + {"function", function}, + {"line", line}, + {"message", message}, }; if (!extra.empty()) @@ -139,7 +307,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c } // convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context *ctx, const std::vector & probs) +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) { json out = json::array(); for (const auto &prob : probs) @@ -148,78 +316,212 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector +static T json_value(const json &body, const std::string &key, const T &default_value) { - bool stream = false; - bool has_next_token = false; - std::string generated_text; - std::vector generated_token_probs; + // Fallback null to default value + return body.contains(key) && !body.at(key).is_null() + ? 
body.value(key, default_value) + : default_value; +} - size_t num_prompt_tokens = 0; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; +struct llama_client_slot +{ + int id; + int task_id = -1; + + struct slot_params params; + + slot_state state = IDLE; + slot_command command = NONE; + + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; + + // generation props + int32_t n_ctx = 0; // context size per slot + int32_t n_past = 0; + int32_t n_decoded = 0; + int32_t n_remaining = -1; + int32_t i_batch = -1; + + int32_t num_prompt_tokens = 0; + int32_t num_prompt_tokens_processed = 0; + int32_t multibyte_pending = 0; json prompt; - std::vector embd; - std::vector last_n_tokens; - - llama_model *model = nullptr; - llama_context *ctx = nullptr; - gpt_params params; - llama_sampling_context ctx_sampling; - int n_ctx; - - grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; + std::string generated_text; + llama_token sampled; + std::vector cache_tokens; + std::vector generated_token_probs; + bool infill = false; + bool embedding = false; + bool has_next_token = true; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; bool stopped_limit = false; + std::string stopping_word; - int32_t multibyte_pending = 0; - std::mutex mutex; + // sampling + struct llama_sampling_params sparams; + llama_sampling_context *ctx_sampling = nullptr; - std::unique_lock lock() - { - return std::unique_lock(mutex); + // multimodal + std::vector images; + + // stats + size_t sent_count = 0; + size_t sent_token_probs_index = 0; + + int64_t t_start_process_prompt; + int64_t t_start_genereration; + + double t_prompt_processing; // ms + double t_token_generation; // ms + + void reset() { + num_prompt_tokens = 0; + generated_text = ""; + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + multibyte_pending = 0; + n_past = 0; + sent_count = 0; + sent_token_probs_index = 0; + infill = false; + + generated_token_probs.clear(); + + for (slot_image &img : images) + { + free(img.image_embedding); + delete[] img.img_data.data; + img.prefix_prompt = ""; + } + + images.clear(); + // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? 
} + bool has_budget(gpt_params &global_params) { + n_remaining = -1; + if(params.n_predict != -1) + { + n_remaining = params.n_predict - n_decoded; + } + else if (global_params.n_predict != -1) + { + n_remaining = global_params.n_predict - n_decoded; + } + return n_remaining > 0 || n_remaining == -1; // no budget || limitless + } + + bool available() const { + return state == IDLE && command == NONE; + } + + bool is_processing() const { + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; + } + + void add_token_string(const completion_token_output &token) { + if (command == RELEASE) + { + return; + } + cache_tokens.push_back(token.tok); + generated_token_probs.push_back(token); + } + + void release() { + if (state == IDLE || state == PROCESSING) + { + t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; + command = RELEASE; + } + } + + json get_formated_timings() { + return json + { + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, + {"predicted_per_token_ms", t_token_generation / n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + }; + } + + void print_timings() { + LOG_TEE("\n"); + LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); + LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); + LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); + } +}; + +struct llama_server_context +{ + llama_model *model = nullptr; + llama_context *ctx = nullptr; + + clip_ctx *clp_ctx = nullptr; + + gpt_params params; + + llama_batch batch; + + bool multimodal = false; + bool clean_kv_cache = true; + bool all_slots_are_idle = false; + + int32_t id_gen; + int32_t n_ctx; // total context for all clients / slots + + // system prompt + bool system_need_update = false; + + std::string system_prompt; + std::vector system_tokens; + + std::string name_user; // this should be the antiprompt + std::string name_assistant; + + // slots / clients + std::vector slots; + + std::vector queue_tasks; + std::vector queue_results; + std::mutex mutex_tasks; + std::mutex mutex_results; + ~llama_server_context() { if (ctx) @@ -234,46 +536,74 @@ struct llama_server_context } } - void rewind() - { - params.antiprompt.clear(); - params.grammar.clear(); - num_prompt_tokens = 0; - num_tokens_predicted = 0; - generated_text = ""; - generated_text.reserve(n_ctx); - generated_token_probs.clear(); - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - multibyte_pending = 0; - n_remain = 0; - n_past = 0; - - if (grammar != nullptr) { - llama_grammar_free(grammar); - grammar = nullptr; - ctx_sampling = llama_sampling_context_init(params, NULL); - } - } - - bool loadModel(const gpt_params ¶ms_) + bool load_model(const gpt_params ¶ms_) { params = params_; + if (!params.mmproj.empty()) { + multimodal = true; + LOG_TEE("Multi Modal Mode Enabled"); 
+ clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); + if(clp_ctx == nullptr) { + LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); + return false; + } + + if (params.n_ctx < 2048) { // request larger context for the image embedding + params.n_ctx = 2048; + } + } + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params_.model}}); + LOG_ERROR("unable to load model", {{"model", params.model}}); return false; } + + if (multimodal) { + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_llm = llama_n_embd(model); + if (n_embd_clip != n_embd_llm) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); + llama_free(ctx); + llama_free_model(model); + return false; + } + } + n_ctx = llama_n_ctx(ctx); - last_n_tokens.resize(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; } + void initialize() { + id_gen = 0; + + // create slots + all_slots_are_idle = true; + + const int32_t n_ctx_slot = n_ctx / params.n_parallel; + + LOG_TEE("Available slots:\n"); + for (int i = 0; i < params.n_parallel; i++) + { + llama_client_slot slot; + + slot.id = i; + slot.n_ctx = n_ctx_slot; + slot.reset(); + + LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + slots.push_back(slot); + } + + batch = llama_batch_init(n_ctx, 0, params.n_parallel); + + // empty system prompt + system_prompt = ""; + system_tokens.clear(); + } + std::vector tokenize(const json & json_prompt, bool add_bos) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, @@ -319,290 +649,274 @@ struct llama_server_context return prompt_tokens; } - bool loadGrammar() - { - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - LOG_ERROR("grammar parse error", {{"grammar", params.grammar}}); - return false; - } - grammar_parser::print_grammar(stderr, parsed_grammar); + llama_client_slot* get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + for (llama_client_slot & slot : slots) + { + if (slot.id == id && slot.available()) { - auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) { - LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); + return &slot; + } + + if (slot.available() && slot.t_last_used < t_last) + { + last_used = &slot; + t_last = slot.t_last_used; + } + } + + return last_used; + } + + bool launch_slot_with_data(llama_client_slot* &slot, json data) { + slot_params default_params; + llama_sampling_params default_sparams; + + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", 
default_sparams.temp); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + + // infill + if (data.count("input_prefix") != 0) + { + slot->params.input_prefix = data["input_prefix"]; + } + else + { + slot->params.input_prefix = ""; + } + + if (data.count("input_suffix") != 0) + { + slot->params.input_suffix = data["input_suffix"]; + } + else + { + slot->params.input_suffix = ""; + } + + if (data.count("prompt") != 0) + { + slot->prompt = data["prompt"]; + } + else + { + slot->prompt = ""; + } + + slot->sparams.logit_bias.clear(); + + if (json_value(data, "ignore_eos", false)) + { + slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } + + const auto &logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) + { + const int n_vocab = llama_n_vocab(model); + for (const auto &el : *logit_bias) + { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) + { + if (el[1].is_number()) + { + slot->sparams.logit_bias[tok] = el[1].get(); + } + else if (el[1].is_boolean() && !el[1].get()) + { + slot->sparams.logit_bias[tok] = -INFINITY; + } + } } } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } - ctx_sampling = llama_sampling_context_init(params, grammar); + + slot->params.antiprompt.clear(); + + const auto &stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) + { + for (const auto &word : *stop) + { + if (!word.empty()) + { + slot->params.antiprompt.push_back(word); + } + } + } + + if (multimodal) + { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) + { + for (const auto &img : *images_data) + { + std::string data_b64 = img["data"].get(); + slot_image img_sl; + img_sl.id = img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); + int width, height, channels; + std::vector image_buffer = base64_decode(data_b64); + data_b64.clear(); + auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3); + if (!data) { + LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); + return false; + } + LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height); + img_sl.img_data.nx = width; + img_sl.img_data.ny = height; + img_sl.img_data.size = width * height * 3; + img_sl.img_data.data = new uint8_t[width * height * 3](); + memcpy(img_sl.img_data.data, data, width * height * 3); + stbi_image_free(data); + img_sl.request_encode_image = true; + slot->images.push_back(img_sl); + } + // process prompt + // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]} + if (slot->images.size() > 0 && !slot->prompt.is_array()) + { + std::string prompt = slot->prompt.get(); + size_t pos = 0, begin_prefix = 0; + std::string pattern = "[img-"; + while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + size_t end_prefix = pos; + pos += pattern.length(); + size_t end_pos = prompt.find("]", pos); + if (end_pos != std::string::npos) + { + std::string image_id = prompt.substr(pos, end_pos - pos); + try + { + int img_id = std::stoi(image_id); + bool found = false; + for (slot_image &img : slot->images) + { + if (img.id == img_id) { + found = true; + img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); + begin_prefix = end_pos + 1; + break; + } + } + if (!found) { + LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + slot->images.clear(); + return false; + } + } catch (const std::invalid_argument& e) { + LOG_TEE("Invalid image number id in prompt\n"); + slot->images.clear(); + return false; + } + } + } + slot->prompt = ""; + slot->params.input_suffix = prompt.substr(begin_prefix); + slot->params.cache_prompt = false; // multimodal doesn't support cache prompt + } + } + } + + if (slot->ctx_sampling != nullptr) + { + llama_sampling_free(slot->ctx_sampling); + } + slot->ctx_sampling = llama_sampling_init(slot->sparams); + slot->command = LOAD_PROMPT; + + all_slots_are_idle = false; + + LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + return true; } - void loadInfill() - { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - - auto prefix_tokens = tokenize(params.input_prefix, false); - auto suffix_tokens = tokenize(params.input_suffix, false); - const int space_token = 29871; - if (suff_rm_leading_spc && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(ctx)); - auto prompt_tokens = prefix_tokens; - - num_prompt_tokens = prompt_tokens.size(); - - if (params.n_keep < 0) - { - params.n_keep = (int)num_prompt_tokens; - } - params.n_keep = std::min(params.n_ctx - 4, params.n_keep); - - // if input 
prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) - { - printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens); - // todo we probably want to cut from both sides - const int n_left = (params.n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", { - {"n_ctx", params.n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; - } - else - { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); - } - - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - embd = prompt_tokens; - - if (n_past == num_prompt_tokens) - { - // we have to evaluate at least 1 token to generate logits. - printf("we have to evaluate at least 1 token to generate logits\n"); - n_past--; - } - - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - LOG_VERBOSE("prompt ingested", { - {"n_past", n_past}, - {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); - - has_next_token = true; - } - void loadPrompt() - { - auto prompt_tokens = tokenize(prompt, true); // always add BOS - - num_prompt_tokens = prompt_tokens.size(); - - if (params.n_keep < 0) - { - params.n_keep = (int)num_prompt_tokens; - } - params.n_keep = std::min(n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)n_ctx) - { - const int n_left = (n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin()); - - LOG_VERBOSE("input truncated", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; - } - else - { - const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); - } - - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - - embd = prompt_tokens; - if (n_past == num_prompt_tokens) - { - // we have to evaluate at least 1 token to generate logits. 
- n_past--; - } - - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - LOG_VERBOSE("prompt ingested", { - {"n_past", n_past}, - {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, - {"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); - - has_next_token = true; + void kv_cache_clear() { + // clear the entire KV cache + llama_kv_cache_clear(ctx); + clean_kv_cache = false; } - void beginCompletion() - { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_set_rng_seed(ctx, params.seed); + void update_system_prompt() { + system_tokens = ::llama_tokenize(ctx, system_prompt, true); + + llama_batch_clear(batch); + + kv_cache_clear(); + + for (int i = 0; i < (int) system_tokens.size(); ++i) + { + llama_batch_add(batch, system_tokens[i], i, { 0 }, false); + } + + if (llama_decode(ctx, batch) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < params.n_parallel; ++i) + { + llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + } + + LOG_TEE("system prompt updated\n"); + system_need_update = false; } - completion_token_output nextToken() - { - completion_token_output result; - result.tok = -1; - - if (embd.size() >= (size_t)n_ctx) + void notify_system_prompt_changed() { + // release all slots + for (llama_client_slot &slot : slots) { - // Shift context - - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left/2; - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - - for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) - { - embd[i - n_discard] = embd[i]; - } - embd.resize(embd.size() - n_discard); - - n_past -= n_discard; - - truncated = true; - LOG_VERBOSE("input truncated", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); + slot.release(); } - bool tg = true; - while (n_past < embd.size()) - { - int n_eval = (int)embd.size() - n_past; - tg = n_eval == 1; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - - if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) - { - LOG_ERROR("failed to eval", { - {"n_eval", n_eval}, - {"n_past", n_past}, - {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, - }); - has_next_token = false; - return result; - } - n_past += n_eval; - } - - if (params.n_predict == 0) - { - has_next_token = false; - result.tok = llama_token_eos(ctx); - return result; - } - - { - // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_n_vocab(model)); - - result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates); - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - const int32_t n_probs = params.sampling_params.n_probs; - if (params.sampling_params.temp <= 0 && n_probs > 0) - { - // For llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &candidates_p); - } - - for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p}); - } - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(result.tok); - if (tg) { - num_tokens_predicted++; - } - } - - // add 
it to the context - embd.push_back(result.tok); - // decrement remaining sampling budget - --n_remain; - - if (!embd.empty() && embd.back() == llama_token_eos(ctx)) - { - // stopping_word = llama_token_to_piece(ctx, embd.back()); - has_next_token = false; - stopped_eos = true; - LOG_VERBOSE("eos token found", {}); - return result; - } - - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; + system_need_update = true; } - size_t findStoppingStrings(const std::string &text, const size_t last_token_size, - const stop_type type) + void process_system_prompt_data(const json &sys_props) { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); + name_assistant = sys_props.value("assistant_name", ""); + + if (slots.size() > 0) + { + notify_system_prompt_changed(); + } + } + + static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, + const stop_type type, llama_client_slot &slot) { size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) + + for (const std::string &word : slot.params.antiprompt) { size_t pos; if (type == STOP_FULL) @@ -620,95 +934,815 @@ struct llama_server_context { if (type == STOP_FULL) { - stopping_word = word; - stopped_word = true; - has_next_token = false; + slot.stopped_word = true; + slot.stopping_word = word; + slot.has_next_token = false; } stop_pos = pos; } } + return stop_pos; } - completion_token_output doCompletion() - { - auto token_with_probs = nextToken(); + bool process_token(completion_token_output &result, llama_client_slot &slot) { + // remember which tokens were sampled - used for repetition penalties during sampling + const std::string token_str = llama_token_to_piece(ctx, result.tok); + slot.sampled = result.tok; - const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(ctx, token_with_probs.tok); - generated_text += token_text; + // search stop word and delete it + slot.generated_text += token_str; + slot.has_next_token = true; - if (params.sampling_params.n_probs > 0) + if (slot.multibyte_pending > 0) { - generated_token_probs.push_back(token_with_probs); + slot.multibyte_pending -= token_str.size(); } - - if (multibyte_pending > 0) + else if (token_str.size() == 1) { - multibyte_pending -= token_text.size(); - } - else if (token_text.size() == 1) - { - const char c = token_text[0]; + const char c = token_str[0]; // 2-byte characters: 110xxxxx 10xxxxxx if ((c & 0xE0) == 0xC0) { - multibyte_pending = 1; + slot.multibyte_pending = 1; // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx } else if ((c & 0xF0) == 0xE0) { - multibyte_pending = 2; + slot.multibyte_pending = 2; // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx } else if ((c & 0xF8) == 0xF0) { - multibyte_pending = 3; + slot.multibyte_pending = 3; } else { - multibyte_pending = 0; + slot.multibyte_pending = 0; } } - if (multibyte_pending > 0 && !has_next_token) + if (slot.multibyte_pending == 0) { - has_next_token = true; - n_remain++; + size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) + { + is_stop_full = true; + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.sent_count, slot.generated_text.size()); + } + else + { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); + } + + // check if there is any token to predict + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) + { + // no send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.sent_count += result.text_to_send.size(); + // add the token to slot queue and cache + } + slot.add_token_string(result); + if (slot.params.stream) + { + send_partial_response(slot, result); + } } - if (!has_next_token && n_remain == 0) + if (slot.multibyte_pending > 0 && !slot.has_next_token) { - stopped_limit = true; + slot.has_next_token = true; + } + + // check the limits + if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) + { + slot.stopped_limit = true; + slot.has_next_token = false; + } + + if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) + { + slot.stopped_eos = true; + slot.has_next_token = false; + LOG_VERBOSE("eos token found", {}); } LOG_VERBOSE("next token", { - {"token", token_with_probs.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, token_with_probs.tok)}, - {"has_next_token", has_next_token}, - {"n_remain", n_remain}, - {"num_tokens_predicted", num_tokens_predicted}, - {"stopped_eos", stopped_eos}, - {"stopped_word", stopped_word}, - {"stopped_limit", stopped_limit}, - {"stopping_word", stopping_word}, + {"token", result.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, + {"num_tokens_predicted", slot.n_decoded}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, }); - return 
token_with_probs; + return slot.has_next_token; // continue } - std::vector getEmbedding() + bool process_images(llama_client_slot &slot) const { - static const int n_embd = llama_n_embd(model); + for (slot_image &img : slot.images) + { + if (!img.request_encode_image) + { + continue; + } + clip_image_f32 img_res; + if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true)) + { + LOG_TEE("Error processing the given image"); + clip_free(clp_ctx); + return false; + } + img.image_tokens = clip_n_patches(clp_ctx); + img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); + if (!img.image_embedding) + { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + clip_free(clp_ctx); + return false; + } + LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); + if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding)) + { + LOG_TEE("Unable to encode image\n"); + return false; + } + img.request_encode_image = false; + } + + return slot.images.size() > 0; + } + + void send_error(int id, std::string error) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = id; + res.error = true; + res.result_json = { { "content", error } }; + queue_results.push_back(res); + } + + json get_model_props() + { + return get_formated_generation(slots[0]); + } + + json get_formated_generation(llama_client_slot &slot) + { + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); + return json { + {"n_ctx", slot.n_ctx}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temp", slot.sparams.temp}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, + {"frequency_penalty", slot.sparams.penalty_freq}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"grammar", slot.sparams.grammar}, + }; + } + + void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = false; + + res.result_json = json + { + {"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs_output = {}; + const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); + size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) + { + probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); + } + slot.sent_token_probs_index = probs_stop_pos; + 
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); + } + + queue_results.push_back(res); + } + + void send_final_response(llama_client_slot &slot) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + res.result_json = json + { + {"content", !slot.params.stream ? slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, + {"generation_settings", get_formated_generation(slot)}, + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()} + }; + + if (slot.sparams.n_probs > 0) + { + std::vector probs = {}; + if (!slot.params.stream && slot.stopped_word) + { + const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + probs = std::vector(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size()); + } + else + { + probs = std::vector( + slot.generated_token_probs.begin(), + slot.generated_token_probs.begin() + slot.sent_token_probs_index); + } + res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); + } + + queue_results.push_back(res); + } + + void send_embedding(llama_client_slot &slot) + { + std::lock_guard lock(mutex_results); + task_result res; + res.id = slot.task_id; + res.error = false; + res.stop = true; + + const int n_embd = llama_n_embd(model); if (!params.embedding) { LOG_WARNING("embedding disabled", { {"params.embedding", params.embedding}, }); - return std::vector(n_embd, 0.0f); + res.result_json = json + { + {"embedding", std::vector(n_embd, 0.0f)}, + }; } - const float *data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); - return embedding; + else + { + const float *data = llama_get_embeddings(ctx); + std::vector embedding(data, data + n_embd); + res.result_json = json + { + {"embedding", embedding }, + }; + } + queue_results.push_back(res); + } + + int request_completion(json data, bool infill, bool embedding) + { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.data = data; + task.infill_mode = infill; + task.embedding_mode = embedding; + task.type = COMPLETION_TASK; + queue_tasks.push_back(task); + return task.id; + } + + task_result next_result(int task_id) + { + while (true) + { + std::this_thread::sleep_for(std::chrono::microseconds(5)); + std::lock_guard lock(mutex_results); + + if (queue_results.empty()) + { + continue; + } + + for (int i = 0; i < (int) queue_results.size(); i++) + { + if (queue_results[i].id == task_id) + { + task_result res = queue_results[i]; + queue_results.erase(queue_results.begin() + i); + return res; + } + } + } + + // never reached + //return task_result{-1, false, false, {}}; + } + + // for multiple images processing + bool ingest_images(llama_client_slot &slot, int n_batch) + { + int image_idx = 0; + + while (image_idx < (int) slot.images.size()) + { + slot_image &img = slot.images[image_idx]; + + // process prefix prompt + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + 
i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + if (llama_decode(ctx, batch_view)) + { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + } + + // process image with llm + for (int i = 0; i < img.image_tokens; i += n_batch) + { + int n_eval = img.image_tokens - i; + if (n_eval > n_batch) + { + n_eval = n_batch; + } + + const int n_embd = llama_n_embd(model); + llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; + if (llama_decode(ctx, batch_img)) + { + LOG_TEE("%s : failed to eval image\n", __func__); + return false; + } + slot.n_past += n_eval; + } + image_idx++; + + llama_batch_clear(batch); + + // append prefix of next image + const auto json_prompt = (image_idx >= (int) slot.images.size()) ? + slot.params.input_suffix : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); + + std::vector append_tokens = tokenize(json_prompt, false); // has next image + for (int i = 0; i < (int) append_tokens.size(); ++i) + { + llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); + slot.n_past += 1; + } + } + + return true; + } + + void request_cancel(int task_id) + { + std::lock_guard lock(mutex_tasks); + task_server task; + task.id = id_gen++; + task.type = CANCEL_TASK; + task.target_id = task_id; + queue_tasks.push_back(task); + } + + void process_tasks() + { + std::lock_guard lock(mutex_tasks); + while (!queue_tasks.empty()) + { + task_server task = queue_tasks.front(); + queue_tasks.erase(queue_tasks.begin()); + switch (task.type) + { + case COMPLETION_TASK: { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + if (slot == nullptr) + { + LOG_TEE("slot unavailable\n"); + // send error result + send_error(task.id, "slot unavailable"); + return; + } + + if (task.data.contains("system_prompt")) + { + process_system_prompt_data(task.data["system_prompt"]); + } + + slot->reset(); + + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; + + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task.id, "internal_error"); + break; + } + } break; + case CANCEL_TASK: { // release slot linked with the task id + for (auto & slot : slots) + { + if (slot.task_id == task.target_id) + { + slot.release(); + break; + } + } + } break; + } + } + } + + bool update_slots() { + // attend tasks + process_tasks(); + + // update the system prompt wait until all slots are idle state + if (system_need_update && all_slots_are_idle) + { + LOG_TEE("updating system prompt\n"); + update_system_prompt(); + } + + llama_batch_clear(batch); + + if (all_slots_are_idle) + { + if (system_prompt.empty() && clean_kv_cache) + { + LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + kv_cache_clear(); + } + // avoid 100% usage of cpu all time + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + + for (llama_client_slot &slot : slots) + { + if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + { + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); + 
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) + { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + + slot.n_past -= n_discard; + + slot.truncated = true; + + LOG_VERBOSE("context shift", { + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + }); + } + } + + // decode any currently ongoing sequences + for (auto & slot : slots) + { + // release the slot + if (slot.command == RELEASE) + { + slot.state = IDLE; + slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + + continue; + } + + if (slot.state == IDLE) + { + continue; + } + + slot.i_batch = batch.n_tokens; + + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true); + + slot.n_decoded += 1; + slot.n_past += 1; + } + + // process in chunks of params.n_batch + int32_t n_batch = params.n_batch; + + // assign workload to the slots + if (params.cont_batching || batch.n_tokens == 0) + { + for (auto & slot : slots) + { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + + // need process the prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT) + { + slot.state = PROCESSING; + slot.command = NONE; + std::vector prompt_tokens; + slot.t_start_process_prompt = ggml_time_us(); + slot.t_start_genereration = 0; + + if (slot.infill) + { + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) + { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + auto prefix_tokens = tokenize(slot.params.input_prefix, false); + auto suffix_tokens = tokenize(slot.params.input_suffix, false); + + const int space_token = 29871; // TODO: this should not be hardcoded + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } + + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + prefix_tokens.push_back(llama_token_middle(model)); + prompt_tokens = prefix_tokens; + } + else + { + prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + } + + slot.num_prompt_tokens = prompt_tokens.size(); + + if (!slot.params.cache_prompt) + { + llama_sampling_reset(slot.ctx_sampling); + + slot.n_past = 0; + slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + } + else + { + if (slot.params.n_keep < 0) + { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) + { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 
2; + const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; + + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } + + // push the prompt into the sampling context (do not apply grammar) + for (auto &token : prompt_tokens) + { + llama_sampling_accept(slot.ctx_sampling, ctx, token, false); + } + + slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + } + + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + + llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + + slot.cache_tokens = prompt_tokens; + + if (slot.n_past == slot.num_prompt_tokens) + { + // we have to evaluate at least 1 token to generate logits. + LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + slot.n_past--; + } + + LOG_VERBOSE("prompt ingested", { + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, + }); + + const bool has_images = process_images(slot); + + // process the prefix of first image + std::vector prefix_tokens = has_images ? 
tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens; + for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) + { + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false); + } + + if (has_images && !ingest_images(slot, n_batch)) + { + LOG_TEE("failed processing images\n"); + return false; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) + { + batch.logits[batch.n_tokens - 1] = true; + } + + slot.n_decoded = 0; + slot.i_batch = batch.n_tokens - 1; + } + } + } + + if (batch.n_tokens == 0) + { + all_slots_are_idle = true; + return true; + } + + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = + { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) + { + if (n_batch == 1 || ret < 0) + { + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + return false; + } + + LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + continue; + } + + for (auto & slot : slots) + { + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) + { + continue; + } + + // prompt evaluated for embedding + if (slot.embedding) + { + send_embedding(slot); + slot.release(); + slot.i_batch = -1; + return true; + } + + completion_token_output result; + const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + + llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + + if (slot.n_decoded == 1) + { + slot.t_start_genereration = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + } + + llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; + result.tok = id; + + const int32_t n_probs = slot.sparams.n_probs; + if (slot.sparams.temp <= 0 && n_probs > 0) + { + // for llama_sample_token_greedy we need to sort candidates + llama_sample_softmax(ctx, &cur_p); + } + + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) + { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + if (!process_token(result, slot)) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + } + + slot.i_batch = -1; + } + } + return true; } }; @@ -720,12 +1754,18 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("options:\n"); printf(" -h, --help show this help message and exit\n"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) @@ -758,11 +1798,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); + printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms) + gpt_params ¶ms, llama_server_context& llama) { gpt_params default_params; server_params default_sparams; @@ -841,6 +1886,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-scaling") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } + } else if (arg == "--rope-freq-base") { if (++i >= argc) @@ -859,6 +1917,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } + else if (arg == "--yarn-ext-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-attn-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-fast") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } + else if (arg == "--yarn-beta-slow") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; @@ -1017,6 +2107,56 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { params.embedding = true; } + else if (arg == "-cb" || arg == "--cont-batching") + { + params.cont_batching = true; + } + else if (arg == "-np" || arg == "--parallel") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "-spf" || arg == "--system-prompt-file") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string systm_content; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(systm_content) + ); + llama.process_system_prompt_data(json::parse(systm_content)); + } + else if(arg == "--mmproj") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.mmproj = argv[i]; + } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -1033,102 +2173,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } -static json 
format_generation_settings(llama_server_context &llama) -{ - const auto & sparams = llama.params.sampling_params; - const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); - const bool ignore_eos = eos_bias != sparams.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); - - return json{ - {"n_ctx", llama.n_ctx}, - {"model", llama.params.model_alias}, - {"seed", llama.params.seed}, - {"temp", sparams.temp}, - {"top_k", sparams.top_k}, - {"top_p", sparams.top_p}, - {"tfs_z", sparams.tfs_z}, - {"typical_p", sparams.typical_p}, - {"repeat_last_n", sparams.repeat_last_n}, - {"repeat_penalty", sparams.repeat_penalty}, - {"presence_penalty", sparams.presence_penalty}, - {"frequency_penalty", sparams.frequency_penalty}, - {"mirostat", sparams.mirostat}, - {"mirostat_tau", sparams.mirostat_tau}, - {"mirostat_eta", sparams.mirostat_eta}, - {"penalize_nl", sparams.penalize_nl}, - {"stop", llama.params.antiprompt}, - {"n_predict", llama.params.n_predict}, - {"n_keep", llama.params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", llama.stream}, - {"logit_bias", sparams.logit_bias}, - {"n_probs", sparams.n_probs}, - {"grammar", llama.params.grammar}, - }; -} - -static json format_embedding_response(llama_server_context &llama) -{ - return json{ - {"embedding", llama.getEmbedding()}, - }; -} - -static json format_timings(llama_server_context &llama) -{ - const auto timings = llama_get_timings(llama.ctx); - - return json{ - {"prompt_n", timings.n_p_eval}, - {"prompt_ms", timings.t_p_eval_ms}, - {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, - {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, - - {"predicted_n", timings.n_eval}, - {"predicted_ms", timings.t_eval_ms}, - {"predicted_per_token_ms", timings.t_eval_ms / timings.n_eval}, - {"predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval}, - }; -} - -static json format_final_response(llama_server_context &llama, const std::string &content, const std::vector &probs) -{ - - json res = json{ - {"content", content}, - {"stop", true}, - {"model", llama.params.model_alias}, - {"tokens_predicted", llama.num_tokens_predicted}, - {"tokens_evaluated", llama.num_prompt_tokens}, - {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.prompt}, - {"truncated", llama.truncated}, - {"stopped_eos", llama.stopped_eos}, - {"stopped_word", llama.stopped_word}, - {"stopped_limit", llama.stopped_limit}, - {"stopping_word", llama.stopping_word}, - {"tokens_cached", llama.n_past}, - {"timings", format_timings(llama)}, - }; - - if (llama.params.sampling_params.n_probs > 0) - { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - static json format_partial_response( - llama_server_context &llama, const std::string &content, const std::vector &probs + llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs ) { - json res = json{ - {"content", content}, - {"stop", false}, + json res = json + { + {"content", content }, + {"stop", false}, + {"slot_id", slot->id }, + {"multimodal", llama.multimodal } }; - if (llama.params.sampling_params.n_probs > 0) + if (slot->sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1148,120 +2204,8 @@ static json format_detokenized_response(std::string content) {"content", content}}; } -template -static T json_value(const json &body, const std::string &key, const T &default_value) -{ - // 
Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; -} -static void parse_options_completion(const json &body, llama_server_context &llama) -{ - gpt_params default_params; - const auto & default_sparams = default_params.sampling_params; - auto & sparams = llama.params.sampling_params; - - llama.stream = json_value(body, "stream", false); - llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict); - sparams.top_k = json_value(body, "top_k", default_sparams.top_k); - sparams.top_p = json_value(body, "top_p", default_sparams.top_p); - sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); - sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); - sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n); - sparams.temp = json_value(body, "temperature", default_sparams.temp); - sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty); - sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty); - sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty); - sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); - sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); - llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.grammar = json_value(body, "grammar", default_params.grammar); - sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); - - if (body.count("prompt") != 0) - { - llama.prompt = body["prompt"]; - } - else - { - llama.prompt = ""; - } - - sparams.logit_bias.clear(); - if (json_value(body, "ignore_eos", false)) - { - sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; - } - - const auto &logit_bias = body.find("logit_bias"); - if (logit_bias != body.end() && logit_bias->is_array()) - { - const int n_vocab = llama_n_vocab(llama.model); - for (const auto &el : *logit_bias) - { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) - { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) - { - if (el[1].is_number()) - { - sparams.logit_bias[tok] = el[1].get(); - } - else if (el[1].is_boolean() && !el[1].get()) - { - sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - llama.params.antiprompt.clear(); - const auto &stop = body.find("stop"); - if (stop != body.end() && stop->is_array()) - { - for (const auto &word : *stop) - { - if (!word.empty()) - { - llama.params.antiprompt.push_back(word); - } - } - } - - llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar); - - LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); -} - -static void parse_options_infill(const json &body, llama_server_context &llama) -{ - if (body.count("input_prefix") != 0) - { - llama.params.input_prefix = body["input_prefix"]; - } - else - { - llama.params.input_prefix = ""; - } - if (body.count("input_suffix") != 0) - { - llama.params.input_suffix = body["input_suffix"]; - } - else - { - llama.params.input_suffix = ""; - } - parse_options_completion(body, llama); -} - -static void 
log_server_request(const Request &req, const Response &res) +static void log_server_request(const httplib::Request &req, const httplib::Response &res) { LOG_INFO("request", { {"remote_addr", req.remote_addr}, @@ -1278,60 +2222,26 @@ static void log_server_request(const Request &req, const Response &res) }); } -static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to by callback_data. -static void beam_search_callback(void *callback_data, llama_beams_state beams_state) { - auto & llama = *static_cast(callback_data); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - llama.generated_token_probs.resize(llama.generated_token_probs.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; - std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); - printf("%zu", n); - } - fflush(stdout); -#if 0 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams:\n"; - for (size_t i=0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["<generated_token_probs; auto translator = token_translator{llama.ctx}; auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (llama.generated_text.capacity() < llama.generated_text.size() + len) { - llama.generated_text.reserve(llama.generated_text.size() + len); + if (slot->generated_text.capacity() < slot->generated_text.size() + len) + { + slot->generated_text.reserve(slot->generated_text.size() + len); } - for (const completion_token_output & cto : gtps) { - llama.generated_text += translator(cto); + for (const completion_token_output & cto : gtps) + { + slot->generated_text += translator(cto); } } @@ -1344,7 +2254,7 @@ int main(int argc, char **argv) // struct that contains llama context and inference llama_server_context llama; - server_params_parse(argc, argv, sparams, params); + server_params_parse(argc, argv, sparams, params, llama); if (params.model_alias == "unknown") { @@ -1353,8 +2263,9 @@ int main(int argc, char **argv) llama_backend_init(params.numa); - LOG_INFO("build info", {{"build", BUILD_NUMBER}, - {"commit", BUILD_COMMIT}}); + LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER}, + {"commit", LLAMA_COMMIT}}); + LOG_INFO("system info", { {"n_threads", params.n_threads}, {"n_threads_batch", params.n_threads_batch}, @@ -1363,412 +2274,261 @@ int main(int argc, char **argv) }); // load the model - if (!llama.loadModel(params)) + if 
(!llama.load_model(params)) { return 1; } - Server svr; + llama.initialize(); + + httplib::Server svr; svr.set_default_headers({{"Server", "llama.cpp"}, {"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Headers", "content-type"}}); // this is only called if no index.html is found in the public --path - svr.Get("/", [](const Request &, Response &res) + svr.Get("/", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html"); - return false; }); + res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html"); + return false; + }); // this is only called if no index.js is found in the public --path - svr.Get("/index.js", [](const Request &, Response &res) + svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript"); - return false; }); + res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript"); + return false; + }); // this is only called if no index.html is found in the public --path - svr.Get("/completion.js", [](const Request &, Response &res) + svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript"); - return false; }); + res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript"); + return false; + }); // this is only called if no index.html is found in the public --path - svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res) + svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); - return false; }); + res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); + return false; + }); - svr.Post("/completion", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); + svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res) + { + res.set_header("Access-Control-Allow-Origin", "*"); + json data = { + { "user_name", llama.name_user.c_str() }, + { "assistant_name", llama.name_assistant.c_str() } + }; + res.set_content(data.dump(), "application/json"); + }); - llama.rewind(); - - llama_reset_timings(llama.ctx); - - parse_options_completion(json::parse(req.body), llama); - - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } - - llama.loadPrompt(); - llama.beginCompletion(); - - if (!llama.stream) { - if (llama.params.n_beams) { - // Fill llama.generated_token_probs vector with final beam. - llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams, - llama.n_past, llama.n_remain); - // Translate llama.generated_token_probs to llama.generated_text. - append_to_generated_text_from_generated_token_probs(llama); - } else { - size_t stop_pos = std::string::npos; - - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(llama.ctx, token_with_probs.tok); - - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } - - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); - } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); - } - } - - auto probs = llama.generated_token_probs; - if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { - const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); - probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); - } - - const json data = format_final_response(llama, llama.generated_text, probs); - - llama_print_timings(llama.ctx); - - res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), - "application/json"); - } else { - const auto chunked_content_provider = [&](size_t, DataSink & sink) { - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { - continue; + svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) + { + json data = json::parse(req.body); + const int task_id = llama.request_completion(data, false, false); + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); + if (!result.error && result.stop) { + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); } - const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); - - size_t pos = std::min(sent_count, llama.generated_text.size()); - - const std::string str_test = llama.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - is_stop_full = true; - llama.generated_text.erase( - llama.generated_text.begin() + pos + stop_pos, - llama.generated_text.end()); - pos = std::min(sent_count, llama.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = llama.findStoppingStrings(str_test, token_text.size(), - STOP_PARTIAL); + else + { + res.status = 404; + res.set_content(result.result_json["content"], "text/plain"); + return; } - - if ( - stop_pos == std::string::npos || - // Send rest of the text if we are at the end of the generation - (!llama.has_next_token && !is_stop_full && stop_pos > 0) - ) { - const std::string to_send = llama.generated_text.substr(pos, std::string::npos); - - sent_count += to_send.size(); - - std::vector probs_output = {}; - - if (llama.params.sampling_params.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); - size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); - size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); - } - sent_token_probs_index = probs_stop_pos; - } - - const json data = format_partial_response(llama, to_send, probs_output); - 
- const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.data(), str.size())) { - LOG_VERBOSE("stream closed", {}); - llama_print_timings(llama.ctx); - return false; - } - } - - if (!llama.has_next_token) { - // Generation is done, send extra information. - const json data = format_final_response( - llama, - "", - std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index) - ); - - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.data(), str.size())) { - LOG_VERBOSE("stream closed", {}); - llama_print_timings(llama.ctx); - return false; - } - } - } - - llama_print_timings(llama.ctx); - sink.done(); - return true; - }; - const auto on_complete = [&](bool) { - llama.mutex.unlock(); - }; - lock.release(); - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } }); - - svr.Post("/infill", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); - - llama.rewind(); - - llama_reset_timings(llama.ctx); - - parse_options_infill(json::parse(req.body), llama); - - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } - llama.loadInfill(); - llama.beginCompletion(); - const auto chunked_content_provider = [&](size_t, DataSink & sink) { - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { - continue; - } - const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); - - size_t pos = std::min(sent_count, llama.generated_text.size()); - - const std::string str_test = llama.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); - if (stop_pos != std::string::npos) { - is_stop_full = true; - llama.generated_text.erase( - llama.generated_text.begin() + pos + stop_pos, - llama.generated_text.end()); - pos = std::min(sent_count, llama.generated_text.size()); } else { - is_stop_full = false; - stop_pos = llama.findStoppingStrings(str_test, token_text.size(), - STOP_PARTIAL); - } - - if ( - stop_pos == std::string::npos || - // Send rest of the text if we are at the end of the generation - (!llama.has_next_token && !is_stop_full && stop_pos > 0) - ) { - const std::string to_send = llama.generated_text.substr(pos, std::string::npos); - - sent_count += to_send.size(); - - std::vector probs_output = {}; - - if (llama.params.sampling_params.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); - size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); - size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) + { + while (true) + { + task_result result = llama.next_result(task_id); + if (!result.error) { + const std::string str = + "data: 
" + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + if (!sink.write(str.c_str(), str.size())) + { + return false; + } + if (result.stop) { + break; + } + } else { + break; + } } - sent_token_probs_index = probs_stop_pos; - } + sink.done(); + return true; + }; - const json data = format_partial_response(llama, to_send, probs_output); + auto on_complete = [task_id, &llama] (bool) + { + // cancel + llama.request_cancel(task_id); + }; - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.data(), str.size())) { - LOG_VERBOSE("stream closed", {}); - llama_print_timings(llama.ctx); - return false; - } + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } + }); - if (!llama.has_next_token) { - // Generation is done, send extra information. - const json data = format_final_response( - llama, - "", - std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index) - ); - - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; - - LOG_VERBOSE("data stream", { - { "to_send", str } - }); - - if (!sink.write(str.data(), str.size())) { - LOG_VERBOSE("stream closed", {}); - llama_print_timings(llama.ctx); - return false; - } - } - } - - llama_print_timings(llama.ctx); - sink.done(); - return true; - }; - const auto on_complete = [&](bool) { - llama.mutex.unlock(); - }; - lock.release(); - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - }); - - svr.Get("/model.json", [&llama](const Request &, Response &res) + svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) { - const json data = format_generation_settings(llama); - return res.set_content(data.dump(), "application/json"); }); + json data = json::parse(req.body); + const int task_id = llama.request_completion(data, true, false); + if (!json_value(data, "stream", false)) { + std::string completion_text; + task_result result = llama.next_result(task_id); + if (!result.error && result.stop) + { + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + } + else + { + res.status = 404; + res.set_content(result.result_json["content"], "text/plain"); + return; + } + } else { + const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) { + while (true) + { + task_result result = llama.next_result(task_id); + if (!result.error) { + const std::string str = + "data: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + if (!sink.write(str.c_str(), str.size())) + { + return false; + } + if (result.stop) + { + break; + } + } + else + { + break; + } + } - svr.Options(R"(/.*)", [](const Request &, Response &res) + sink.done(); + + return true; + }; + + auto on_complete = [task_id, &llama] (bool) + { + // cancel + llama.request_cancel(task_id); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); + } + }); + + svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) + { + const json data = llama.get_model_props(); + return res.set_content(data.dump(), 
"application/json"); + }); + + svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) { return res.set_content("", "application/json"); }); - svr.Post("/tokenize", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); + svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) + { + const json body = json::parse(req.body); + std::vector tokens; + if (body.count("content") != 0) + { + tokens = llama.tokenize(body["content"], false); + } + const json data = format_tokenizer_response(tokens); + return res.set_content(data.dump(), "application/json"); + }); - const json body = json::parse(req.body); - std::vector tokens; - if (body.count("content") != 0) - { - tokens = llama.tokenize(body["content"], false); - } - const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json"); }); + svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) + { + const json body = json::parse(req.body); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; + content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + } - svr.Post("/detokenize", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); + const json data = format_detokenized_response(content); + return res.set_content(data.dump(), "application/json"); + }); - const json body = json::parse(req.body); - std::string content; - if (body.count("tokens") != 0) - { - const std::vector tokens = body["tokens"]; - content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); - } - - const json data = format_detokenized_response(content); - return res.set_content(data.dump(), "application/json"); }); - - svr.Post("/embedding", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); - - const json body = json::parse(req.body); - - llama.rewind(); - llama_reset_timings(llama.ctx); - if (body.count("content") != 0) - { - llama.prompt = body["content"]; - } - else - { - llama.prompt = ""; - } - llama.params.n_predict = 0; - llama.loadPrompt(); - llama.beginCompletion(); - llama.doCompletion(); - - const json data = format_embedding_response(llama); - return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) + { + const json body = json::parse(req.body); + json prompt; + if (body.count("content") != 0) + { + prompt = body["content"]; + } + else + { + prompt = ""; + } + const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true); + task_result result = llama.next_result(task_id); + return res.set_content(result.result_json.dump(), "application/json"); + }); svr.set_logger(log_server_request); - svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) - { - const char fmt[] = "500 Internal Server Error\n%s"; - char buf[BUFSIZ]; - try { - std::rethrow_exception(std::move(ep)); - } catch (std::exception & e) { - snprintf(buf, sizeof(buf), fmt, e.what()); - } catch (...) 
{ - snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); - } - res.set_content(buf, "text/plain"); - res.status = 500; }); + svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) + { + const char fmt[] = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try + { + std::rethrow_exception(std::move(ep)); + } + catch (std::exception &e) + { + snprintf(buf, sizeof(buf), fmt, e.what()); + } + catch (...) + { + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain"); + res.status = 500; + }); - svr.set_error_handler([](const Request &, Response &res) - { - if (res.status == 400) { - res.set_content("Invalid request", "text/plain"); - } else if (res.status != 500) { - res.set_content("File Not Found", "text/plain"); - res.status = 404; - } }); + svr.set_error_handler([](const httplib::Request &, httplib::Response &res) + { + if (res.status == 400) + { + res.set_content("Invalid request", "text/plain"); + } + else if (res.status != 500) + { + res.set_content("File Not Found", "text/plain"); + res.status = 404; + } + }); // set timeouts and change hostname and port - svr.set_read_timeout(sparams.read_timeout); + svr.set_read_timeout (sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); if (!svr.bind_to_port(sparams.hostname, sparams.port)) @@ -1781,22 +2541,38 @@ int main(int argc, char **argv) svr.set_base_dir(sparams.public_path); // to make it ctrl+clickable: - printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); LOG_INFO("HTTP server listening", { {"hostname", sparams.hostname}, {"port", sparams.port}, }); - if (!svr.listen_after_bind()) + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + return 1; + } + + return 0; + }); + + // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!? + // "Bus error: 10" - this is on macOS, it does not crash on Linux + //std::thread t2([&]() { - return 1; + bool running = true; + while (running) + { + running = llama.update_slots(); + } } + //); + + t.join(); - if (llama.grammar != nullptr) { - llama_grammar_free(llama.grammar); - } llama_backend_free(); - return 0; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 24fb16b78..374aef6f1 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -92,16 +92,11 @@ int main(int argc, char ** argv) { // create a llama_batch with size 512 // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(512, 0); + llama_batch batch = llama_batch_init(512, 0, 1); // evaluate the initial prompt - batch.n_tokens = tokens_list.size(); - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = tokens_list[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (size_t i = 0; i < tokens_list.size(); i++) { + llama_batch_add(batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -138,7 +133,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? 
- if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { LOG_TEE("\n"); break; @@ -148,15 +143,10 @@ int main(int argc, char ** argv) { fflush(stdout); // prepare the next batch - batch.n_tokens = 0; + llama_batch_clear(batch); // push this new token for next evaluation - batch.token [batch.n_tokens] = new_token_id; - batch.pos [batch.n_tokens] = n_cur; - batch.seq_id[batch.n_tokens] = 0; - batch.logits[batch.n_tokens] = true; - - batch.n_tokens += 1; + llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); n_decode += 1; } diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt index 6c5c9456e..810f3c46a 100644 --- a/examples/speculative/CMakeLists.txt +++ b/examples/speculative/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 018dbf9a2..798684f66 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,14 +1,27 @@ -#include "build-info.h" - #include "common.h" #include "llama.h" -#include "grammar-parser.h" #include #include #include #include +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + +struct seq_draft { + bool active = false; + bool drafting = false; + bool skip = false; + + int i_batch_dft = 0; + std::vector i_batch_tgt; + + std::vector tokens; + + struct llama_sampling_context * ctx_sampling; +}; + int main(int argc, char ** argv) { gpt_params params; @@ -21,6 +34,13 @@ int main(int argc, char ** argv) { return 1; } + // max number of parallel drafting sequences (i.e. tree branches) + const int n_seq_dft = params.n_parallel; + + // TODO: make this configurable + const float p_accept = 0.80f; + const float p_split = 0.10f; + #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); LOG_TEE("Log start\n"); @@ -45,6 +65,33 @@ int main(int argc, char ** argv) { params.n_gpu_layers = params.n_gpu_layers_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + const int vocab_diff = n_vocab_tgt > n_vocab_dft + ? 
n_vocab_tgt - n_vocab_dft + : n_vocab_dft - n_vocab_tgt; + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); + fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return 1; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); + fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, + llama_token_to_piece(ctx_tgt, i).c_str(), + llama_token_to_piece(ctx_dft, i).c_str()); + return 1; + } + } + } + // tokenize the prompt std::vector inp; inp = ::llama_tokenize(ctx_tgt, params.prompt, true); @@ -77,8 +124,6 @@ int main(int argc, char ** argv) { const auto t_enc_end = ggml_time_us(); // the 2 models should have the same vocab - const int n_ctx = llama_n_ctx(ctx_tgt); - const int n_vocab = llama_n_vocab(model_tgt); //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); // how many tokens to draft each time @@ -91,116 +136,129 @@ int main(int argc, char ** argv) { int n_past_tgt = inp.size(); int n_past_dft = inp.size(); - std::vector drafted; - - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); - - for (auto & id : inp) { - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); - } - - std::vector candidates; - candidates.reserve(n_vocab); - // used to determine end of generation bool has_eos = false; - // grammar stuff - struct llama_grammar * grammar_dft = NULL; - struct llama_grammar * grammar_tgt = NULL; + // target model sampling context + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); - grammar_parser::parse_state parsed_grammar; + // draft sequence data + std::vector drafts(n_seq_dft); - // if requested - load the grammar, error checking is omitted for brevity - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } + params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar + params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); const auto t_dec_start = ggml_time_us(); - while (true) { - LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); + // sample from the last token of the prompt + drafts[0].i_batch_tgt.resize(1); + drafts[0].i_batch_tgt[0] = 0; - int i_dft = 0; + while (true) { + // print current draft sequences + for (int s = 0; s < n_seq_dft; ++s) { + 
if (!drafts[s].active) { + continue; + } + + const auto & tokens = drafts[s].tokens; + + LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); + } + + int i_dft = 0; + int s_keep = 0; while (true) { + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + // sample from the target model - llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft); + llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - // remember which tokens were sampled - used for repetition penalties during sampling - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens)); + //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); const std::string token_str = llama_token_to_piece(ctx_tgt, id); + printf("%s", token_str.c_str()); fflush(stdout); - if (id == llama_token_eos(ctx_tgt)) { + if (id == llama_token_eos(model_tgt)) { has_eos = true; } ++n_predict; - // check if the draft matches the target - if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); - ++n_accept; - ++n_past_tgt; - ++n_past_dft; - ++i_dft; - - continue; - } - - // the drafted token was rejected or we are out of drafted tokens - - if (i_dft < (int) drafted.size()) { - LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n", - i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str()); - } else { - LOG("out of drafted tokens\n"); - } - - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0)); - ++n_past_dft; - - // heuristic for n_draft + // check if the target token matches any of the drafts { - const int n_draft_cur = (int) drafted.size(); - const bool all_accepted = i_dft == n_draft_cur; + bool matches = false; - LOG("n_draft = %d\n", n_draft); - LOG("n_draft_cur = %d\n", n_draft_cur); - LOG("i_dft = %d\n", i_dft); - LOG("all_accepted = %d\n", all_accepted); + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } - if (all_accepted && n_draft == n_draft_cur) { - LOG(" - max drafted tokens accepted - n_draft += 8\n"); - n_draft = std::min(30, n_draft + 8); - } else if (all_accepted) { - LOG(" - partially drafted tokens accepted - no change\n"); - } else { - LOG(" - drafted token rejected - n_draft -= 1\n"); - n_draft = std::max(2, n_draft - 1); + if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { + LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + + s_keep = s; + matches = true; + } else { + drafts[s].active = false; + } + } + + if (matches) { + ++n_accept; + ++n_past_tgt; + ++n_past_dft; + ++i_dft; + + continue; } } - drafted.clear(); - drafted.push_back(id); + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + + // TODO: simplify + { + LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + + llama_kv_cache_seq_keep(ctx_dft, s_keep); + llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, 
-1); + llama_kv_cache_seq_keep(ctx_dft, 0); + + llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(ctx_tgt, s_keep); + llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_tgt, 0); + } + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].tokens.clear(); + drafts[s].i_batch_tgt.clear(); + } + // note: will be erased after the speculation phase + drafts[0].tokens.push_back(id); + drafts[0].i_batch_tgt.push_back(0); + + llama_batch_clear(batch_dft); + llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_decode (ctx_dft, batch_dft); + + ++n_past_dft; break; } @@ -209,78 +267,151 @@ int main(int argc, char ** argv) { break; } - if (grammar_tgt) { - if (grammar_dft) { - llama_grammar_free(grammar_dft); - } - // Note: Hardcoded to sequence id 0, if this ever supports parallel generation - // that will need to change. - auto it = ctx_sampling.sequence_contexts.find(0); - GGML_ASSERT(it != ctx_sampling.sequence_contexts.end()); - // This is necessary because each sequence id in sequence_contexts - // uses a copy of the original grammar. - grammar_dft = llama_grammar_copy(it->second.grammar); + llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - LOG("copied target grammar to draft grammar\n"); - } - - // sample n_draft tokens from the draft model using greedy decoding + int n_seq_cur = 1; int n_past_cur = n_past_dft; + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].drafting = false; + } + drafts[0].active = true; + drafts[0].drafting = true; + drafts[0].i_batch_dft = 0; + + llama_batch_clear(batch_tgt); + llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { - float * logits = llama_get_logits(ctx_dft); + batch_dft.n_tokens = 0; - candidates.clear(); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].skip = false; } - llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].drafting || drafts[s].skip) { + continue; + } - if (grammar_dft != NULL) { - llama_sample_grammar(ctx_dft, &cur_p, grammar_dft); + llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + + const auto & cur_p = drafts[s].ctx_sampling->cur; + + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + } + + if (cur_p[0].p < p_accept) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); + drafts[s].drafting = false; + continue; + } + + std::vector sa(1, s); + + // attempt to split the branch if the probability is high enough + for (int f = 1; f < 8; ++f) { + if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + + // all previous tokens from this branch are now also part of the new branch + 
for (int t = 0; t < batch_tgt.n_tokens; ++t) { + for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { + if (batch_tgt.seq_id[t][p] == s) { + batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur; + batch_tgt.n_seq_id[t]++; + break; + } + } + } + + // copy the draft state + drafts[n_seq_cur].active = true; + drafts[n_seq_cur].drafting = true; + drafts[n_seq_cur].skip = true; + + drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; + drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; + + llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); + + sa.push_back(n_seq_cur); + + n_seq_cur++; + } else { + break; + } + } + + // add drafted token for each sequence + for (int is = 0; is < (int) sa.size(); ++is) { + const llama_token id = cur_p[is].id; + + const int s = sa[is]; + + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); + + drafts[s].tokens.push_back(id); + + // add unique drafted tokens to the target batch + drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + + llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + + // add the token to the batch for batched decoding with the draft model + drafts[s].i_batch_dft = batch_dft.n_tokens; + + llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + + if (batch_tgt.n_tokens > n_draft) { + drafts[s].drafting = false; + } + } } - // computes softmax and sorts the candidates - llama_sample_softmax(ctx_dft, &cur_p); - - for (int i = 0; i < 3; ++i) { - LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str()); - } - - // TODO: better logic? - if (cur_p.data[0].p < 2*cur_p.data[1].p) { - LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p); + // no sequence is drafting anymore + if (batch_dft.n_tokens == 0) { break; } - // drafted token - const llama_token id = cur_p.data[0].id; - - drafted.push_back(id); + // evaluate the drafted tokens on the draft model + llama_decode(ctx_dft, batch_dft); + ++n_past_cur; ++n_drafted; - // no need to evaluate the last drafted token, since we won't use the result - if (i == n_draft - 1) { + if (batch_tgt.n_tokens > n_draft) { break; } - - // evaluate the drafted token on the draft model - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1); - llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0)); - ++n_past_cur; - - if (grammar_dft != NULL) { - llama_grammar_accept_token(ctx_dft, grammar_dft, id); - } } // evaluate the target model on the drafted tokens - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1); - llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0)); - ++n_past_tgt; + { + llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + } - // the first token is always proposed by the traget model before the speculation loop - drafted.erase(drafted.begin()); + // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + llama_decode(ctx_tgt, batch_tgt); + ++n_past_tgt; + } + + // the first token is always proposed by the traget model before the speculation loop so we erase it here + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + } } auto t_dec_end = ggml_time_us(); @@ -288,9 +419,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n\n"); 
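For reference, a minimal self-contained C++ sketch of the drafting rule used in the loop above. The thresholds p_accept = 0.80 and p_split = 0.10 and the cap n_seq_dft are taken from the diff; the candidate probabilities in the sketch are made up. The rule: the top candidate must clear p_accept for a sequence to keep drafting, and runner-up candidates above p_split may fork additional draft branches while fewer than n_seq_dft sequences are active.

// speculative_heuristic_sketch.cpp (illustrative only, plain STL, no llama.cpp APIs)
#include <cstdio>
#include <vector>

struct candidate { int id; float p; };   // stand-in for a sorted llama_token_data entry

int main() {
    const float p_accept  = 0.80f; // below this, the sequence stops drafting
    const float p_split   = 0.10f; // above this, a runner-up token forks a new branch
    const int   n_seq_dft = 4;     // max parallel draft branches

    // sorted candidates for one drafting step of one sequence (made-up numbers)
    std::vector<candidate> cur_p = { {42, 0.84f}, {7, 0.12f}, {13, 0.03f} };

    int n_seq_cur = 1;

    if (cur_p[0].p < p_accept) {
        printf("stop drafting: top probability %.2f < %.2f\n", cur_p[0].p, p_accept);
        return 0;
    }

    // the best candidate continues the current branch
    printf("draft token %d on current branch (p = %.2f)\n", cur_p[0].id, cur_p[0].p);

    // runner-up candidates that are still likely enough fork additional branches
    for (size_t f = 1; f < cur_p.size() && n_seq_cur < n_seq_dft; ++f) {
        if (cur_p[f].p <= p_split) {
            break;
        }
        printf("split: token %d (p = %.2f) starts branch %d\n", cur_p[f].id, cur_p[f].p, n_seq_cur);
        n_seq_cur++;
    }

    return 0;
}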
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - // TODO: make sure these numbers are computed correctly LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); @@ -304,16 +434,19 @@ int main(int argc, char ** argv) { LOG_TEE("\ntarget:\n"); llama_print_timings(ctx_tgt); + llama_sampling_free(ctx_sampling); + for (int s = 0; s < n_seq_dft; ++s) { + llama_sampling_free(drafts[s].ctx_sampling); + } + + llama_batch_free(batch_dft); + llama_free(ctx_tgt); llama_free_model(model_tgt); llama_free(ctx_dft); llama_free_model(model_dft); - if (grammar_dft != NULL) { - llama_grammar_free(grammar_dft); - llama_grammar_free(grammar_tgt); - } llama_backend_free(); fprintf(stderr, "\n\n"); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index be693b3ac..2a257e632 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) { set_param_model(model); // measure data size - struct ggml_allocr * alloc = NULL; - alloc = ggml_allocr_new_measure(tensor_alignment); - alloc_model(alloc, model); + size_t size = 0; + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + size += GGML_PAD(ggml_nbytes(t), tensor_alignment); + } // allocate data - model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); - ggml_allocr_free(alloc); + struct ggml_allocr * alloc = NULL; + model->data.resize(size + tensor_alignment); alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); alloc_model(alloc, model); ggml_allocr_free(alloc); @@ -348,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs( // not capturing these, to silcence warnings const int rope_mode = 0; - return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale); + return ggml_rope_custom( + ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ); }; set_name(tokens_input, "tokens_input"); @@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); // measure required memory for input tensors - alloc = ggml_allocr_new_measure(tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; - ggml_allocr_free(alloc); + size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + + GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + + tensor_alignment; printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // allocate input tensors diff --git a/flake.lock b/flake.lock index a7777d05d..0455f6561 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "systems": "systems" }, 
"locked": { - "lastModified": 1692799911, - "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", "owner": "numtide", "repo": "flake-utils", - "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1692913444, - "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=", + "lastModified": 1698318101, + "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "18324978d632ffc55ef1d928e81630c620f4f447", + "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index cfc4776a4..4cf28d5c1 100644 --- a/flake.nix +++ b/flake.nix @@ -11,8 +11,7 @@ meta.mainProgram = "llama"; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; buildInputs = with pkgs; [ openmpi ]; - osSpecific = with pkgs; buildInputs ++ - ( + osSpecific = with pkgs; buildInputs ++ ( if isAarch64 && isDarwin then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate @@ -51,6 +50,9 @@ }; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); + # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime + llama-python-extra = + pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]); postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" @@ -93,12 +95,15 @@ }; packages.rocm = pkgs.stdenv.mkDerivation { inherit name src meta postPatch nativeBuildInputs postInstall; - buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; + buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ]; cmakeFlags = cmakeFlags ++ [ "-DLLAMA_HIPBLAS=1" "-DCMAKE_C_COMPILER=hipcc" "-DCMAKE_CXX_COMPILER=hipcc" - "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" + # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM + # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt + # and select the line that matches the current nixpkgs version of rocBLAS. 
+ "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" ]; }; apps.llama-server = { @@ -126,5 +131,9 @@ buildInputs = [ llama-python ]; packages = nativeBuildInputs ++ osSpecific; }; + devShells.extra = pkgs.mkShell { + buildInputs = [ llama-python-extra ]; + packages = nativeBuildInputs ++ osSpecific; + }; }); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 654d3632f..baf02df2b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -29,6 +29,8 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream @@ -37,6 +39,10 @@ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceGetMemPool hipDeviceGetMemPool +#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold +#define cudaMemPoolSetAttribute hipMemPoolSetAttribute +#define cudaMemPool_t hipMemPool_t #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t @@ -46,6 +52,7 @@ #define cudaEvent_t hipEvent_t #define cudaEventDestroy hipEventDestroy #define cudaFree hipFree +#define cudaFreeAsync hipFreeAsync #define cudaFreeHost hipHostFree #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount @@ -53,6 +60,7 @@ #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc +#define cudaMallocFromPoolAsync hipMallocFromPoolAsync #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync @@ -85,6 +93,24 @@ #define CC_OFFSET_AMD 1000000 #define CC_RDNA2 (CC_OFFSET_AMD + 1030) +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. 
the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +//#define GGML_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +// probably other such cases, and not sure what happens on AMD hardware +#if !defined(GGML_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -161,11 +187,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cudaError_t err_ = (err); \ if (err_ != cudaSuccess) { \ - int id; \ - cudaGetDevice(&id); \ + int dev_id; \ + cudaGetDevice(&dev_id); \ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ cudaGetErrorString(err_)); \ - fprintf(stderr, "current device: %d\n", id); \ + fprintf(stderr, "current device: %d\n", dev_id); \ exit(1); \ } \ } while (0) @@ -175,11 +201,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ - int id; \ - cudaGetDevice(&id); \ + int dev_id; \ + cudaGetDevice(&dev_id); \ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ - fprintf(stderr, "current device: %d\n", id); \ + fprintf(stderr, "current device: %d\n", dev_id); \ exit(1); \ } \ } while (0) @@ -445,6 +471,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA #define MAX_STREAMS 8 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; +static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr }; struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors @@ -468,7 +495,6 @@ static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = true; static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default @@ -494,6 +520,15 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d dst[i] = __hadd(x[i], __float2half(y[i])); } +static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __half2float(x[i]) + y[i]; +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -3552,9 +3587,15 @@ static __device__ __forceinline__ void mul_mat_q( #define MMQ_X_Q4_0_RDNA1 64 #define MMQ_Y_Q4_0_RDNA1 64 #define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else #define MMQ_X_Q4_0_AMPERE 64 #define MMQ_Y_Q4_0_AMPERE 128 #define NWARPS_Q4_0_AMPERE 4 +#endif #define MMQ_X_Q4_0_PASCAL 64 #define MMQ_Y_Q4_0_PASCAL 64 #define NWARPS_Q4_0_PASCAL 8 @@ -3613,9 +3654,15 @@ template static __global__ void #define MMQ_X_Q4_1_RDNA1 64 #define MMQ_Y_Q4_1_RDNA1 64 #define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define 
MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else #define MMQ_X_Q4_1_AMPERE 64 #define MMQ_Y_Q4_1_AMPERE 128 #define NWARPS_Q4_1_AMPERE 4 +#endif #define MMQ_X_Q4_1_PASCAL 64 #define MMQ_Y_Q4_1_PASCAL 64 #define NWARPS_Q4_1_PASCAL 8 @@ -3676,9 +3723,15 @@ template static __global__ void #define MMQ_X_Q5_0_RDNA1 64 #define MMQ_Y_Q5_0_RDNA1 64 #define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else #define MMQ_X_Q5_0_AMPERE 128 #define MMQ_Y_Q5_0_AMPERE 64 #define NWARPS_Q5_0_AMPERE 4 +#endif #define MMQ_X_Q5_0_PASCAL 64 #define MMQ_Y_Q5_0_PASCAL 64 #define NWARPS_Q5_0_PASCAL 8 @@ -3737,9 +3790,15 @@ template static __global__ void #define MMQ_X_Q5_1_RDNA1 64 #define MMQ_Y_Q5_1_RDNA1 64 #define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else #define MMQ_X_Q5_1_AMPERE 128 #define MMQ_Y_Q5_1_AMPERE 64 #define NWARPS_Q5_1_AMPERE 4 +#endif #define MMQ_X_Q5_1_PASCAL 64 #define MMQ_Y_Q5_1_PASCAL 64 #define NWARPS_Q5_1_PASCAL 8 @@ -3798,9 +3857,15 @@ mul_mat_q5_1( #define MMQ_X_Q8_0_RDNA1 64 #define MMQ_Y_Q8_0_RDNA1 64 #define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else #define MMQ_X_Q8_0_AMPERE 128 #define MMQ_Y_Q8_0_AMPERE 64 #define NWARPS_Q8_0_AMPERE 4 +#endif #define MMQ_X_Q8_0_PASCAL 64 #define MMQ_Y_Q8_0_PASCAL 64 #define NWARPS_Q8_0_PASCAL 8 @@ -3859,9 +3924,15 @@ template static __global__ void #define MMQ_X_Q2_K_RDNA1 128 #define MMQ_Y_Q2_K_RDNA1 32 #define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else #define MMQ_X_Q2_K_AMPERE 64 #define MMQ_Y_Q2_K_AMPERE 128 #define NWARPS_Q2_K_AMPERE 4 +#endif #define MMQ_X_Q2_K_PASCAL 64 #define MMQ_Y_Q2_K_PASCAL 64 #define NWARPS_Q2_K_PASCAL 8 @@ -3920,9 +3991,15 @@ mul_mat_q2_K( #define MMQ_X_Q3_K_RDNA1 32 #define MMQ_Y_Q3_K_RDNA1 128 #define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else #define MMQ_X_Q3_K_AMPERE 128 #define MMQ_Y_Q3_K_AMPERE 128 #define NWARPS_Q3_K_AMPERE 4 +#endif #define MMQ_X_Q3_K_PASCAL 64 #define MMQ_Y_Q3_K_PASCAL 64 #define NWARPS_Q3_K_PASCAL 8 @@ -3983,9 +4060,15 @@ template static __global__ void #define MMQ_X_Q4_K_RDNA1 32 #define MMQ_Y_Q4_K_RDNA1 64 #define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else #define MMQ_X_Q4_K_AMPERE 64 #define MMQ_Y_Q4_K_AMPERE 128 #define NWARPS_Q4_K_AMPERE 4 +#endif #define MMQ_X_Q4_K_PASCAL 64 #define MMQ_Y_Q4_K_PASCAL 64 #define NWARPS_Q4_K_PASCAL 8 @@ -4046,9 +4129,15 @@ template static __global__ void #define MMQ_X_Q5_K_RDNA1 32 #define MMQ_Y_Q5_K_RDNA1 64 #define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else #define MMQ_X_Q5_K_AMPERE 64 #define MMQ_Y_Q5_K_AMPERE 128 #define NWARPS_Q5_K_AMPERE 4 +#endif #define MMQ_X_Q5_K_PASCAL 64 #define MMQ_Y_Q5_K_PASCAL 64 #define NWARPS_Q5_K_PASCAL 8 @@ -4107,9 +4196,15 @@ mul_mat_q5_K( #define MMQ_X_Q6_K_RDNA1 32 #define MMQ_Y_Q6_K_RDNA1 64 #define NWARPS_Q6_K_RDNA1 8 +#if 
defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else #define MMQ_X_Q6_K_AMPERE 64 #define MMQ_Y_Q6_K_AMPERE 64 #define NWARPS_Q6_K_AMPERE 4 +#endif #define MMQ_X_Q6_K_PASCAL 64 #define MMQ_Y_Q6_K_PASCAL 64 #define NWARPS_Q6_K_PASCAL 8 @@ -4326,13 +4421,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const half * x = (const half *) vx; - const int row_x = blockDim.y*blockIdx.y + threadIdx.y; - const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; const int channel_x = channel / channel_x_divisor; - const int nrows_y = ncols_x; + const int nrows_y = ncols_x; const int nrows_dst = nrows_x; - const int row_dst = row_x; + const int row_dst = row_x; const int idst = channel*nrows_dst + row_dst; @@ -4345,13 +4440,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous break; } - const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; - const float xi = __half2float(x[ix]); - const int row_y = col_x; + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; const int iy = channel*nrows_y + row_y; + const float xi = __half2float(x[ix]); + tmp += xi * y[iy]; } @@ -4405,11 +4500,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } -// rope == RoPE == rotary positional embedding +static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} +struct rope_corr_dims { + float v[4]; +}; + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static __device__ void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// rope == RoPE == rotary positional embedding template -static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4421,10 +4546,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t const int i2 = row/p_delta_rows; const int p = has_pos ? 
pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, -float(col)/ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + 1]; @@ -4434,8 +4559,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t } template -static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope_neox( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4446,11 +4573,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; + // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero + const float cur_rot = -float(col)/ncols; + const int p = has_pos ? pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, cur_rot); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + ncols/2]; @@ -4459,8 +4589,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4472,7 +4604,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int i = row*ncols + col; const int i2 = row/p_delta_rows; - const float col_theta_scale = powf(theta_scale, col); + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); // FIXME: this is likely wrong const int p = pos != nullptr ? 
pos[i2] : 0; @@ -4614,6 +4746,11 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co add_f16_f32_f16<<>>(x, y, dst, k); } +static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f32<<>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<>>(x, y, dst, kx, ky); @@ -5491,40 +5628,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const } template -static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } template -static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, 
freq_scale, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -5628,6 +5779,16 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } +static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) { + if (g_cudaMemPools[id] == nullptr) { + return ggml_cuda_pool_malloc(size, actual_size); + } + void *ptr; + CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream)); + *actual_size = size; + return ptr; +} + static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); int id; @@ -5646,6 +5807,13 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { } +static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) { + if (g_cudaMemPools[id] == nullptr) { + return ggml_cuda_pool_free(ptr, actual_size); + } + CUDA_CHECK(cudaFreeAsync(ptr, stream)); +} + void ggml_init_cublas() { static bool initialized = false; @@ -5661,11 +5829,21 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; +#if defined(GGML_CUDA_FORCE_MMQ) + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); +#endif +#if defined(CUDA_USE_TENSOR_CORES) + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); +#else + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); +#endif fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -5675,21 +5853,28 @@ void ggml_init_cublas() { g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); // create cuda streams - for (int64_t is = 0; is < MAX_STREAMS; ++is) { + for (int is = 0; is < MAX_STREAMS; ++is) { CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); } // create cublas handle CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); + + // configure memory pool + cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); + if (err == cudaSuccess) { + size_t treshold = UINT64_MAX; + CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); + } } // configure logging to stdout @@ -5907,7 +6092,10 @@ inline void ggml_cuda_op_add( add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { 
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream); } else { + fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type); GGML_ASSERT(false); } @@ -6252,16 +6440,15 @@ inline void ggml_cuda_op_mul_mat_cublas( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { - GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src0_dd_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_dd_i != nullptr); - + GGML_ASSERT(dst_dd_i != nullptr); const int64_t ne00 = src0->ne[0]; - const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; int id; @@ -6281,7 +6468,7 @@ inline void ggml_cuda_op_mul_mat_cublas( const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); GGML_ASSERT(to_fp16_cuda != nullptr); size_t ne = row_diff*ne00; - src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as); + src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream); to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); } const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16; @@ -6292,13 +6479,12 @@ inline void ggml_cuda_op_mul_mat_cublas( const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); GGML_ASSERT(to_fp16_cuda != nullptr); size_t ne = src1_ncols*ne10; - src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as); + src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); } const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16; - - size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as); + size_t dst_f16_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream); const half alpha_f16 = 1.0f; const half beta_f16 = 0.0f; @@ -6316,14 +6502,15 @@ inline void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream); - ggml_cuda_pool_free(dst_f16, dst_as); - - if (src0_as != 0) { - ggml_cuda_pool_free(src0_as_f16, src0_as); + if (dst_f16_as != 0) { + ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream); } + if (src0_as != 0) { + ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream); + } if (src1_as != 0) { - ggml_cuda_pool_free(src1_as_f16, src1_as); + ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream); } } else { @@ -6333,7 +6520,7 @@ inline void ggml_cuda_op_mul_mat_cublas( if (src0->type != GGML_TYPE_F32) { const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); GGML_ASSERT(to_fp32_cuda != nullptr); - src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); } const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? 
(const float *) src0_dd_i : src0_ddq_as_f32; @@ -6346,11 +6533,11 @@ inline void ggml_cuda_op_mul_mat_cublas( cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, + src1_ddf_i, ne10, &beta, dst_dd_i, ldc)); if (src0_as != 0) { - ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream); } } @@ -6372,17 +6559,20 @@ inline void ggml_cuda_op_rope( const int64_t ne2 = dst->ne[2]; const int64_t nrows = ggml_nrows(src0); - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + // RoPE alteration for extended context - - float freq_base, freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); const int32_t * pos = nullptr; if ((mode & 1) == 0) { @@ -6394,24 +6584,39 @@ inline void ggml_cuda_op_rope( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + // compute if (is_glm) { GGML_ASSERT(false); - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } } else { if (src0->type == GGML_TYPE_F32) { - rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_cuda((const half *)src0_dd, (half *)dst_dd, 
ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } @@ -6522,8 +6727,10 @@ inline void ggml_cuda_op_clamp( GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float min = ((float *) dst->op_params)[0]; - const float max = ((float *) dst->op_params)[1]; + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -6753,21 +6960,22 @@ static void ggml_cuda_op_mul_mat( src0_dd[id] = (char *) src0_extra->data_device[id]; } else { const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); - src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream); } if (src1_on_device && src1_is_contiguous) { src1_ddf[id] = (float *) src1_extra->data_device[id]; } else { - src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream); } if (convert_src1_to_q8_1) { - src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; + src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream); if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); - CUDA_CHECK(cudaGetLastError()); + // CUDA_CHECK(cudaGetLastError()); } } @@ -6775,7 +6983,7 @@ static void ggml_cuda_op_mul_mat( dst_dd[id] = (float *) dst_extra->data_device[id]; } else { const size_t size_dst_ddf = split ? 
(row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); - dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); + dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream); } } @@ -6901,24 +7109,6 @@ static void ggml_cuda_op_mul_mat( } } - for (int64_t id = 0; id < g_device_count; ++id) { - CUDA_CHECK(ggml_cuda_set_device(id)); - - // free buffers again when done - if (src0_as[id] > 0) { - ggml_cuda_pool_free(src0_dd[id], src0_as[id]); - } - if (src1_asf[id] > 0) { - ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); - } - if (src1_asq[id] > 0) { - ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); - } - if (dst_as[id] > 0) { - ggml_cuda_pool_free(dst_dd[id], dst_as[id]); - } - } - // main device waits for all other devices to be finished if (split && g_device_count > 1) { int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; @@ -6936,6 +7126,21 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaDeviceSynchronize()); } + + for (int64_t id = 0; id < g_device_count; ++id) { + if (src0_as[id] > 0) { + ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]); + } + if (src1_asq[id] > 0) { + ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]); + } + if (dst_as[id] > 0) { + ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]); + } + } } static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -7013,7 +7218,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens } static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ - GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -7023,11 +7229,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t ne12 = src1->ne[2]; - const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t ne12 = src1->ne[2]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -7046,27 +7252,222 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } +__global__ void k_compute_batched_ptrs( + const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, + const void ** ptrs_src, void ** ptrs_dst, + int ne12, int ne13, + int ne23, + int nb02, int nb03, + int nb12, int nb13, + int nb2, int nb3, + int r2, int r3) { + int i13 = blockIdx.x * blockDim.x + threadIdx.x; + int i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + 
i12* nb2/2 + i13* nb3/2; +} + +static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream)); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + half * src0_as_f16 = (half *) src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK( + cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + 
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + ne12*ne13, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + const void ** ptrs_src = nullptr; + void ** ptrs_dst = nullptr; + + size_t ptrs_src_s = 0; + size_t ptrs_dst_s = 0; + + ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream); + ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream); + + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( + src0_as_f16, src1_as_f16, dst_f16, + ptrs_src, ptrs_dst, + ne12, ne13, + ne23, + nb02, nb03, + nb12, nb13, + dst->nb[2], dst->nb[3], + r2, r3); + CUDA_CHECK(cudaGetLastError()); + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (ptrs_src_s != 0) { + ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream); + } + if (ptrs_dst_s != 0) { + ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream); + } + } +#endif + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + if (src1_as != 0) { + ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream); + } + if (dst_as != 0) { + ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream); + } +} + static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + const bool all_on_device = + (src0->backend == GGML_BACKEND_GPU) && + (src1->backend == GGML_BACKEND_GPU) && + ( dst->backend == GGML_BACKEND_GPU); int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_compute_capabilities[id] - && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { min_compute_capability = g_compute_capabilities[id]; } } - if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { +#ifdef CUDA_USE_TENSOR_CORES + const bool use_tensor_cores = true; +#else + const bool use_tensor_cores = false; +#endif + + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + + if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + // KQ + KQV multi-batch + ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { - #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else @@ -7079,7 +7480,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { - if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { + use_mul_mat_q = false; + } + + if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); @@ -7433,10 +7842,6 @@ void ggml_cuda_set_main_device(const int main_device) { } } -void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { - g_mul_mat_q = mul_mat_q; -} - void ggml_cuda_set_scratch_size(const size_t scratch_size) { // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously // it still won't always work as expected, but it's better than nothing diff --git a/ggml-impl.h b/ggml-impl.h new file mode 100644 index 000000000..5ec18a50c --- /dev/null +++ b/ggml-impl.h @@ -0,0 +1,237 @@ 
+#pragma once + +#include "ggml.h" + +// GGML internal header + +#include <assert.h> +#include <stddef.h> +#include <stdbool.h> +#include <string.h> // memcpy +#include <math.h> // fabsf + +#ifdef __cplusplus +extern "C" { +#endif + +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. +// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#ifndef __SSE3__ +#define __SSE3__ +#endif +#endif + +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include <arm_neon.h> + +#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_COMPUTE_FP32_TO_FP16(x) (x) + +#define GGML_FP16_TO_FP32(x) ((float) (x)) +#define GGML_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include <wasm_simd128.h> +#else +#ifdef __POWER9_VECTOR__ +#include <altivec.h> +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include <intrin.h> +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) +#include <immintrin.h> +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include <riscv_vector.h> +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return 
fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
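[Editor's aside, not part of the patch: the header above relies on a precomputed 2^16-entry float table (ggml_table_f32_f16, 256 KB) so that converting a half-precision value becomes a single indexed load. Below is a minimal, self-contained C++ sketch of that idea; half_to_float and the other names are illustrative stand-ins, not ggml symbols, and the converter simply mirrors the fallback bit-manipulation path shown above.]

    // Sketch: build the 2^16-entry FP16 -> FP32 table once, then every
    // conversion is a single table lookup (the ggml_table_f32_f16 idea).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    static float fp32_from_bits(uint32_t w) { float f; std::memcpy(&f, &w, sizeof(f)); return f; }
    static uint32_t fp32_to_bits(float f) { uint32_t w; std::memcpy(&w, &f, sizeof(w)); return w; }

    // reference converter, same bit manipulation as the fallback path above
    static float half_to_float(uint16_t h) {
        const uint32_t w     = (uint32_t) h << 16;
        const uint32_t sign  = w & UINT32_C(0x80000000);
        const uint32_t two_w = w + w;

        const uint32_t exp_offset = UINT32_C(0xE0) << 23;
        const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); // 2^-112
        const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

        const uint32_t magic_mask = UINT32_C(126) << 23;
        const float magic_bias = 0.5f;
        const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

        const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
        const uint32_t result = sign |
            (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
        return fp32_from_bits(result);
    }

    int main() {
        std::vector<float> table(1 << 16);           // 256 KB, built once
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            table[i] = half_to_float((uint16_t) i);  // index == raw 16-bit pattern
        }

        const uint16_t one_half = 0x3C00;            // 1.0f in IEEE 754 binary16
        std::printf("0x3C00 -> %f\n", table[one_half]); // prints 1.000000
        return 0;
    }

The trade-off is a small amount of static data in exchange for removing per-element bit manipulation, which is why ggml initializes ggml_table_f32_f16 once in ggml_init() and routes GGML_FP16_TO_FP32 through the lookup on targets without a native conversion.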
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +#endif + + // TODO: backend v2 PR + +#ifdef __cplusplus +} +#endif diff --git a/ggml-metal.m b/ggml-metal.m index 87fa17216..b33a3cb8f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -62,6 +62,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast GGML_METAL_DECL_KERNEL(scale); + GGML_METAL_DECL_KERNEL(scale_4); GGML_METAL_DECL_KERNEL(silu); GGML_METAL_DECL_KERNEL(relu); GGML_METAL_DECL_KERNEL(gelu); @@ -73,6 +74,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); + GGML_METAL_DECL_KERNEL(get_rows_q5_0); + GGML_METAL_DECL_KERNEL(get_rows_q5_1); GGML_METAL_DECL_KERNEL(get_rows_q8_0); GGML_METAL_DECL_KERNEL(get_rows_q2_K); GGML_METAL_DECL_KERNEL(get_rows_q3_K); @@ -87,6 +90,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32); GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); @@ -97,6 +102,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32); GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); @@ -203,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + if (sourcePath == nil) { + GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); + sourcePath = @"ggml-metal.metal"; + } GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; if (error) { @@ -227,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { // load kernels { NSError * error = nil; -#define GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + + /* GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ + */ +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, 
[[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -243,6 +257,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul_row); GGML_METAL_ADD_KERNEL(scale); + GGML_METAL_ADD_KERNEL(scale_4); GGML_METAL_ADD_KERNEL(silu); GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(gelu); @@ -254,6 +269,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); + GGML_METAL_ADD_KERNEL(get_rows_q5_0); + GGML_METAL_ADD_KERNEL(get_rows_q5_1); GGML_METAL_ADD_KERNEL(get_rows_q8_0); GGML_METAL_ADD_KERNEL(get_rows_q2_K); GGML_METAL_ADD_KERNEL(get_rows_q3_K); @@ -268,6 +285,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32); GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); @@ -278,8 +297,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); @@ -335,6 +356,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul); GGML_METAL_DEL_KERNEL(mul_row); GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(scale_4); GGML_METAL_DEL_KERNEL(silu); GGML_METAL_DEL_KERNEL(relu); GGML_METAL_DEL_KERNEL(gelu); @@ -346,6 +368,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_f16); GGML_METAL_DEL_KERNEL(get_rows_q4_0); GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q5_0); + GGML_METAL_DEL_KERNEL(get_rows_q5_1); GGML_METAL_DEL_KERNEL(get_rows_q8_0); GGML_METAL_DEL_KERNEL(get_rows_q2_K); GGML_METAL_DEL_KERNEL(get_rows_q3_K); @@ -360,6 +384,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32); GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); @@ -370,8 +396,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); @@ -905,15 +933,20 @@ void ggml_metal_graph_compute( const float scale = *(const float *) src1->data; - [encoder setComputePipelineState:ctx->pipeline_scale]; + int64_t n = ggml_nelements(dst); + + if 
(n % 4 == 0) { + n /= 4; + [encoder setComputePipelineState:ctx->pipeline_scale_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_scale]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { @@ -968,11 +1001,15 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SOFT_MAX: { - const int nth = MIN(32, ne00); + int nth = 32; // SIMD width if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; } else { + do { + nth *= 2; + } while (nth <= ne00 && nth <= 1024); + nth /= 2; [encoder setComputePipelineState:ctx->pipeline_soft_max]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -980,8 +1017,9 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { @@ -1052,6 +1090,8 @@ void ggml_metal_graph_compute( case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break; + case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break; case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; @@ -1121,6 +1161,24 @@ void ggml_metal_graph_compute( nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; + case GGML_TYPE_Q5_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; + } break; + case GGML_TYPE_Q5_1: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; + } break; case GGML_TYPE_Q8_0: { GGML_ASSERT(ne02 == 1); @@ -1201,7 +1259,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || + src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, 
nth1, 1)]; } @@ -1233,6 +1292,8 @@ void ggml_metal_graph_compute( case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break; + case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break; case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; @@ -1339,14 +1400,18 @@ void ggml_metal_graph_compute( const int nth = MIN(1024, ne00); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; - float freq_base; - float freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; @@ -1354,30 +1419,35 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false); }; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; - [encoder 
setBytes:&freq_scale length:sizeof(float) atIndex:23]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 99b9fd7a7..7c35f23a7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -18,6 +18,21 @@ typedef struct { uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; +#define QK5_0 32 +typedef struct { + half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + +#define QK5_1 32 +typedef struct { + half d; // delta + half m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; + #define QK8_0 32 typedef struct { half d; // delta @@ -110,9 +125,17 @@ kernel void kernel_mul_row( } kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_scale_4( device const float4 * src0, device float4 * dst, - constant float & scale, + constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } @@ -161,36 +184,73 @@ kernel void kernel_soft_max( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint 
sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) { + float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) { lmax = MAX(lmax, psrc0[i00]); } - const float max = simd_max(lmax); + + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float lsum = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { const float exp_psrc0 = exp(psrc0[i00] - max); lsum += exp_psrc0; // Remember the result of exp here. exp is expensive, so we really do not - // whish to compute it twice. + // wish to compute it twice. pdst[i00] = exp_psrc0; } - const float sum = simd_sum(lsum); + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; + + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { pdst[i00] /= sum; } } @@ -201,37 +261,73 @@ kernel void kernel_soft_max_4( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; + threadgroup float * buf [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + const int64_t i03 = (tgpig) / (ne02*ne01); + const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; + const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; - for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + float4 lmax4 = tpitg < ne00/4 ? 
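The reworked kernel_soft_max above no longer assumes a single simdgroup per row: each simdgroup first reduces its lanes with simd_max/simd_sum, lane 0 (tiisg == 0) writes the partial into the threadgroup buffer, and the first ntg/32 threads then tree-reduce that buffer before the value is broadcast from buf[0]. A rough serial C emulation of the max pass, illustration only (threadgroup_max and lane_val are hypothetical names, not part of the patch; the sum pass has the same shape with addition in place of max):

    // serial picture of the two-level reduction: ntg threads, 32 lanes per simdgroup,
    // lane_val[t] holds thread t's local maximum over its strided slice of the row
    static float threadgroup_max(const float * lane_val, int ntg) {
        float buf[32];                                   // one slot per simdgroup (ntg <= 1024)
        const int n_sg = ntg/32;
        for (int sg = 0; sg < n_sg; ++sg) {              // what simd_max does within a simdgroup
            float m = lane_val[sg*32];
            for (int lane = 1; lane < 32; ++lane) {
                if (lane_val[sg*32 + lane] > m) m = lane_val[sg*32 + lane];
            }
            buf[sg] = m;                                 // stored by the lane with tiisg == 0
        }
        for (int i = n_sg/2; i > 0; i /= 2) {            // tree reduction across simdgroups
            for (int t = 0; t < i; ++t) {
                if (buf[t + i] > buf[t]) buf[t] = buf[t + i];
            }
        }
        return buf[0];                                   // value every thread reads back
    }
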
psrc4[tpitg] : -INFINITY; + + for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) { lmax4 = fmax(lmax4, psrc4[i00]); } - float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - const float max = simd_max(lmax); + const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); + float max = simd_max(lmax); + if (tiisg == 0) { + buf[sgitg] = max; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max = buf[0]; // parallel sum float4 lsum4 = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { const float4 exp_psrc4 = exp(psrc4[i00] - max); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } - float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - const float sum = simd_sum(lsum); + const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + float sum = simd_sum(lsum); + if (tiisg == 0) { + buf[sgitg] = sum; + } - for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + buf[tpitg] += buf[tpitg + i]; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sum = buf[0]; + + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { pdst4[i00] /= sum; } } @@ -251,7 +347,7 @@ kernel void kernel_diag_mask_inf( dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; } else { dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; - } + } } kernel void kernel_diag_mask_inf_8( @@ -399,8 +495,11 @@ kernel void kernel_rms_norm( // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; + float2 acc = 0.f; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2); + for (int i = 0; i < 8; i+=2) { acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + yl[i + 1] * (qs[i / 2] & 0x0F00); @@ -417,8 +516,11 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; float m = qb_curr->m; - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + for (int i = 0; i < 8; i+=2) { acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + yl[i + 1] * (qs[i / 2] & 0x0F00); @@ -428,6 +530,49 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre return d * (acc[0] + acc[1]) + sumy * m; } +// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for 
(int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) + + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) + + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + return d * (sumy * -16.f + acc[0] + acc[1]); +} + +// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_1/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float m = qb_curr->m; + + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) + + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) + + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + return d * (acc[0] + acc[1]) + sumy * m; +} + // putting them in the kernel cause a significant performance penalty #define N_DST 4 // each SIMD group works on 4 rows #define N_SIMDGROUP 2 // number of SIMD groups in a thread group @@ -525,6 +670,43 @@ kernel void kernel_mul_mv_q4_1_f32( mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } +kernel void kernel_mul_mv_q5_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + +kernel void kernel_mul_mv_q5_1_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + + #define NB_Q8_0 8 kernel void kernel_mul_mv_q8_0_f32( @@ -879,6 +1061,45 @@ kernel void kernel_alibi_f32( } } +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. 
Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + thread float * cos_theta, thread float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + *cos_theta = cos(theta) * mscale; + *sin_theta = sin(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + typedef void (rope_t)( device const void * src0, device const int32_t * src1, @@ -902,8 +1123,13 @@ typedef void (rope_t)( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]); @@ -932,8 +1158,13 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { @@ -943,19 +1174,22 @@ kernel void kernel_rope( const bool is_neox = mode & 2; + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + device const int32_t * pos = src1; const int64_t p = pos[i2]; - const float theta_0 = freq_scale * (float)p; + const float theta_0 = (float)p; const float inv_ndims = -1.f/n_dims; if (!is_neox) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { const float theta = theta_0 * pow(freq_base, inv_ndims*i0); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -970,9 +1204,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float theta = theta_0 * pow(freq_base, 
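One way to read the rope_yarn_corr_factor comment above (a sketch, assuming the angle for dimension pair d at position p is p * b^(-2d/n_dims) with b = freq_base, as kernel_rope computes it): that pair completes p/(2*pi) * b^(-2d/n_dims) full rotations, so asking for n_rot rotations at p = n_orig_ctx and solving for d gives

    $$ n_\text{rot} = \frac{n_\text{orig\_ctx}}{2\pi}\, b^{-2d/n_\text{dims}} \;\Longrightarrow\; d = \frac{n_\text{dims}\,\ln\!\big(n_\text{orig\_ctx} / (2\pi\, n_\text{rot})\big)}{2\,\ln b} $$

which is the value rope_yarn_corr_factor returns; rope_yarn_corr_dims evaluates it at beta_fast and beta_slow to obtain the start and end of the extrapolation ramp.
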
inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); + float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; @@ -2149,6 +2386,62 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg } } +template +void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + md; + reg[i/2][2*(i%2)+1] = d * x1 + md; + } +} + +template +void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 
0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + m; + reg[i/2][2*(i%2)+1] = d * x1 + m; + } +} + template void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { device const int8_t * qs = ((device const int8_t *)xb->qs); @@ -2490,6 +2783,8 @@ template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows; @@ -2518,6 +2813,8 @@ template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 4a331f24a..202bcb485 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -19,7 +19,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#define CL_DMMV_BLOCK_SIZE 32 +#define CL_DMMV_LOCAL_SIZE 32 #ifndef K_QUANTS_PER_ITERATION #define K_QUANTS_PER_ITERATION 1 @@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, const int row = get_group_id(0); const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); __global const struct block_q2_K * x = xx + ib0; @@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, const int row = get_group_id(0); const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); __global const struct block_q3_K * x = xx + ib0; @@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, const int row = get_group_id(0); const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; @@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * 
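The dequantize_q5_0/dequantize_q5_1 templates above rebuild each 5-bit value from a packed low nibble in qs plus one bit taken from the 32-bit qh field. A minimal CPU-side sketch of the same unpacking for one q5_0 block, consistent with the quantize_row_q5_0_reference layout further down (dequantize_block_q5_0_sketch is a hypothetical name for illustration, not the library's own routine; block_q5_0, QK5_0 and GGML_FP16_TO_FP32 are assumed to be the C-side definitions from ggml-quants.h/ggml-impl.h):

    #include <stdint.h>
    #include <string.h>

    // one q5_0 block -> 32 floats, y = d * (q - 16) where q is the reassembled 5-bit value
    static void dequantize_block_q5_0_sketch(const block_q5_0 * x, float * y) {
        const float d = GGML_FP16_TO_FP32(x->d);
        uint32_t qh;
        memcpy(&qh, x->qh, sizeof(qh));                       // 32 "5th bits", one per element
        for (int j = 0; j < QK5_0/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j +  0)) & 1) << 4; // 5th bit of the low-nibble element
            const uint8_t xh_1 = ((qh >> (j + 16)) & 1) << 4; // 5th bit of the high-nibble element
            const int32_t x0 = ((x->qs[j] & 0x0F) | xh_0) - 16;
            const int32_t x1 = ((x->qs[j] >>   4) | xh_1) - 16;
            y[j            ] = x0 * d;                        // first half of the block
            y[j + QK5_0/2  ] = x1 * d;                        // second half of the block
        }
    }
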
xx, const int row = get_group_id(0); const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); const int tid = get_local_id(0)/2; // 0...15 const int ix = get_local_id(0)%2; @@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const int row = get_group_id(0); const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; + const int ib0 = row*num_blocks_per_row + get_global_offset(0); __global const struct block_q6_K * x = xx + ib0; @@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int block_size = get_local_size(0); + const int local_size = get_local_size(0); const int row = get_group_id(0); const int tid = get_local_id(0); const uint qk = QUANT_K; const uint qr = QUANT_R; + const int col_step = local_size * 2; const int y_offset = qr == 1 ? 1 : qk/2; + x += get_global_offset(0); + tmp[tid] = 0; - for (int i = 0; i < ncols/block_size; i += 2) { - const int col = i*block_size + 2*tid; + for (int col = tid*2; col < ncols; col += col_step) { const int ib = (row*ncols + col)/qk; // block index const int iqs = (col%qk)/qr; // quant index const int iybs = col - col%qk; // y block start index @@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float // sum up partial sums and write back result barrier(CLK_LOCAL_MEM_FENCE); - for (int s=block_size/2; s>0; s>>=1) { + for (int s=local_size/2; s>0; s>>=1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } @@ -1393,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; - const int64_t ne0 = ne00 * ne01 * ne02 * ne03; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; const int64_t ne13 = src1->ne[3]; - const int64_t nb10 = src1->nb[0]; const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; size_t x_size; size_t d_size; - cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 + cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. 
- cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst + cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const int i0 = i03*ne02 + i02; - cl_event ev; // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev)); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - if (nb10 == sizeof(float)) { - // Contiguous, avoid overhead from queueing many kernel runs - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; - size_t global = ne00 * ne01; - cl_int ky = ne10; - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - } else { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int64_t i11 = i01%ne11; - const int i1 = i13*ne12*ne11 + i12*ne11 + i11; + size_t global = ne00 * ne01; + cl_int ky = ne10 * ne11; - cl_int x_offset = i01*ne00; - cl_int y_offset = i1*ne10; - cl_int d_offset = i01*ne00; - - // compute - size_t global = ne00; - cl_int ky = ne10; - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - } - } + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); CL_CHECK(clReleaseEvent(ev)); CL_CHECK(clFinish(queue)); @@ -1516,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; - int64_t pi02 = -1; - int64_t pi03 = -1; - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; 
i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - // copy data to device - if (src0->backend == GGML_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else if (i02 != pi02 || i03 != pi03) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - pi02 = i02; - pi03 = i03; + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } } @@ -1566,7 +1538,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr ggml_cl_pool_free(d_D, d_size); } -static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { GGML_ASSERT(fp16_support); const int64_t ne00 = src0->ne[0]; @@ -1596,6 +1568,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; + GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne); + GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne); + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata; + size_t x_size; size_t y_size; size_t d_size; @@ -1612,74 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); size_t x_offset = 0; - int64_t pi02 = -1; - int64_t pi03 = -1; - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; - - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; - - // copy src0 to device - if (src0->backend == GGML_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else if (i02 != pi02 || i03 != pi03) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - pi02 = i02; - pi03 = i03; - } - - // convert src1 to fp16 - // TODO: use multiple threads - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 
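The reworked ggml_cl_mul_mat_f32 loop above iterates src0 slices on the outside and the broadcasted src1 slices on the inside, so each src0 slice is copied to the device once and reused r2*r3 times. A minimal sketch of the index mapping it relies on (r2 = ne12/ne02, r3 = ne13/ne03 and the ne* names are from the surrounding code; illustration only):

    // every src1 slice (i13, i12) maps back onto src0 slice (i13/r3, i12/r2);
    // inverting that mapping gives the loop structure used in the patch:
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03*r3; i13 < (i03 + 1)*r3; i13++) {          // all i13 with i13/r3 == i03
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                // src0 slice (i03, i02) is uploaded (or offset into) once here
                for (int64_t i12 = i02*r2; i12 < (i02 + 1)*r2; i12++) {  // all i12 with i12/r2 == i02
                    // GEMM against src1 slice (i13, i12), result written to dst slice (i13, i12)
                }
            }
        }
    }
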
* ne10) * (i13 * ne12 + i12); - char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); + + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // convert src1 to fp16 + // TODO: use multiple threads + char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); + } + } } - } - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - for (int64_t i10 = 0; i10 < ne10; i10++) { - // very slow due to no inlining - tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + for (int64_t i10 = 0; i10 < ne10; i10++) { + // very slow due to no inlining + tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); + } + } } + + // copy src1 to device + CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host, then convert to float + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + ggml_fp16_to_fp32_row(tmp, d, d_ne); } } - - // copy src1 to device - CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); } } @@ -1704,7 +1676,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; const ggml_type type = src0->type; - const bool mul_mat_vec = ne11 == 1; + const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -1737,90 +1709,86 @@ static void 
ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * GGML_ASSERT(to_fp32_cl != nullptr); const size_t global_denom = ggml_cl_global_denom(type); - const size_t local = ggml_cl_local_size(type); + const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type); size_t ev_idx = 0; std::vector events; - int64_t pi02 = -1; - int64_t pi03 = -1; - - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; - - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; - - // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { - if (i02 != pi02 || i03 != pi03) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy and dequantize src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device if necessary + if (src0->backend == GGML_BACKEND_CPU) { events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - pi02 = i02; - pi03 = i03; - } - } else if (src0->backend == GGML_BACKEND_GPU) { - d_Q = (cl_mem) src0->extra; - } else { - GGML_ASSERT(false); - } - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); - - // compute - const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; - const size_t local = CL_DMMV_BLOCK_SIZE; - const cl_int ncols = ne00; - events.emplace_back(); - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); - } else { // general dequantization kernel + CLBlast matrix matrix multiplication - // convert src0 to fp32 on device - const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? 
events.data() : NULL, NULL)); - - // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - events.emplace_back(); - - // wait for conversion - CL_CHECK(clFinish(queue)); - - // compute - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, events.data() + ev_idx++); - - if (status != clblast::StatusCode::kSuccess) { + } else if (src0->backend == GGML_BACKEND_GPU) { + d_Q = (cl_mem) src0->extra; + } else { GGML_ASSERT(false); } - } - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); - for (auto *event : events) { - clReleaseEvent(event); - } + if (!mul_mat_vec) { + // convert src0 to fp32 on device + const size_t global = x_ne / global_denom; + const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + } - ev_idx = 0; - events.clear(); + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); + + // compute + const size_t global = ne01 * local; + const size_t offset = src0->backend == GGML_BACKEND_GPU ? 
(i03 * ne02 + i02) * x_bps : 0; + const cl_int ncols = ne00; + events.emplace_back(); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); + } else { // CLBlast matrix matrix multiplication + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + // wait for conversion + CL_CHECK(clFinish(queue)); + + // compute + events.emplace_back(); + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, events.data() + ev_idx++); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } + + ev_idx = 0; + events.clear(); + } + } } } @@ -1895,8 +1863,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * } size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - return ggml_nelements(src1) * sizeof(ggml_fp16_t); + if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); } return 0; } diff --git a/k_quants.c b/ggml-quants.c similarity index 69% rename from k_quants.c rename to ggml-quants.c index 558f5fda8..740be6dc5 100644 --- a/k_quants.c +++ b/ggml-quants.c @@ -1,9 +1,10 @@ -#include "k_quants.h" -#include "ggml.h" +#include "ggml-quants.h" +#include "ggml-impl.h" #include #include #include +#include #ifdef __ARM_NEON @@ -46,7 +47,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if !defined(__riscv) +#if !defined(__riscv) && !defined(__s390__) #include #endif #endif @@ -65,6 +66,1026 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); +} + +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return 
_mm_cvtss_f32(res); +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +#if defined(__AVX2__) || defined(__AVX512F__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = _mm256_set_epi64x( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = 
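bytes_from_nibbles_32 above loads 16 packed bytes and spreads them into 32 bytes of 4-bit values, low nibbles first, high nibbles in the upper half. A scalar equivalent for reference (bytes_from_nibbles_32_scalar is a hypothetical name, illustration only):

    #include <stdint.h>

    // 16 packed bytes -> 32 values in [0, 15], matching the layout produced by the SIMD helper
    static void bytes_from_nibbles_32_scalar(const uint8_t * in, uint8_t * out) {
        for (int j = 0; j < 16; ++j) {
            out[j     ] = in[j] & 0x0F; // low nibbles fill the first 16 output bytes
            out[j + 16] = in[j] >> 4;   // high nibbles fill the last 16
        }
    }
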
_mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = 
_mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) + +#if !defined(__aarch64__) + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +#endif +#endif + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// reference implementation for deterministic creation of model files +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} + +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} + +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} + +void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, 
ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_0); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#elif defined(__riscv_v_intrinsic) + + size_t vl = __riscv_vsetvl_e32m4(QK8_1); + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); + + vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); + + // convert to integer + vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); + vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); + + // store result + __riscv_vse8_v_i8m1(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = sum*d; + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + // // 2-6 bit quantization in super-blocks // @@ -368,10 +1389,10 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict int l = nearest_int(iscale*scales[j]); y[i].scales[j] = l; } - y[i].d = ggml_fp32_to_fp16(max_scale/q4scale); + y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale); } else { for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d 
= GGML_FP32_TO_FP16(0.f); } if (max_min > 0) { float iscale = q4scale/max_min; @@ -379,14 +1400,14 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict int l = nearest_int(iscale*mins[j]); y[i].scales[j] |= (l << 4); } - y[i].dmin = ggml_fp32_to_fp16(max_min/q4scale); + y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale); } else { - y[i].dmin = ggml_fp32_to_fp16(0.f); + y[i].dmin = GGML_FP32_TO_FP16(0.f); } for (int j = 0; j < QK_K/16; ++j) { - const float d = ggml_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF); + const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4); + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); for (int ii = 0; ii < 16; ++ii) { int l = nearest_int((x[16*j + ii] + dm)/d); l = MAX(0, MIN(3, l)); @@ -417,8 +1438,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * q = x[i].qs; @@ -462,12 +1483,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { } size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - // TODO - collect histograms - although, at a second thought, I don't really care about them - (void)hist; - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } @@ -510,16 +1528,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict l >>= 4; y[i].scales[j%4 + 8] |= (l << (2*(j/4))); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); } else { - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); } int8_t sc; for (int j = 0; j < QK_K/16; ++j) { sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - float d = ggml_fp16_to_fp32(y[i].d) * sc; + float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) { continue; } @@ -539,16 +1557,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict l2 = 8 + MAX(-8, MIN(7, l2)); y[i].scales[j/2] = l1 | (l2 << 4); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); } else { for (int j = 0; j < QK_K/16; j+=2) { y[i].scales[j/2] = 0; } - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); } for (int j = 0; j < QK_K/16; ++j) { int s = j%2 == 0 ? 
y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; - float d = ggml_fp16_to_fp32(y[i].d) * (s - 8); + float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); if (!d) { continue; } @@ -602,7 +1620,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q = x[i].qs; const uint8_t * restrict hm = x[i].hmask; @@ -647,7 +1665,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q = x[i].qs; const uint8_t * restrict hm = x[i].hmask; @@ -678,12 +1696,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { } size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - // TODO - collect histograms - although, at a second thought, I don't really care about them - (void)hist; - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } @@ -740,15 +1755,15 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = ggml_fp32_to_fp16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = ggml_fp16_to_fp32(y[i].d) * sc; + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * m; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + dm)/d); l = MAX(0, MIN(15, l)); @@ -765,17 +1780,17 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict int m2 = nearest_int(inv_min*mins[1]); y[i].scales[0] = d1 | (m1 << 4); y[i].scales[1] = d2 | (m2 << 4); - y[i].d[0] = ggml_fp32_to_fp16(max_scale/s_factor); - y[i].d[1] = ggml_fp32_to_fp16(max_min/s_factor); + y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor); + y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor); float sumlx = 0; int suml2 = 0; for (int j = 0; j < QK_K/32; ++j) { const uint8_t sd = y[i].scales[j] & 0xF; const uint8_t sm = y[i].scales[j] >> 4; - const float d = ggml_fp16_to_fp32(y[i].d[0]) * sd; + const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd; if (!d) continue; - const float m = ggml_fp16_to_fp32(y[i].d[1]) * sm; + const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + m)/d); l = MAX(0, MIN(15, l)); @@ -785,7 +1800,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } if (suml2) { - y[i].d[0] = ggml_fp32_to_fp16(sumlx/suml2); + y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2); } #endif uint8_t * q = y[i].qs; @@ -809,8 +1824,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int #if QK_K == 256 - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = 
GGML_FP16_TO_FP32(x[i].dmin); int is = 0; uint8_t sc, m; @@ -824,8 +1839,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int q += 32; is += 2; } #else - const float dall = ggml_fp16_to_fp32(x[i].d[0]); - const float mall = ggml_fp16_to_fp32(x[i].d[1]); + const float dall = GGML_FP16_TO_FP32(x[i].d[0]); + const float mall = GGML_FP16_TO_FP32(x[i].d[1]); const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); for (int l = 0; l < 32; ++l) { @@ -846,9 +1861,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; (void)hist; // TODO: collect histograms - for (int j = 0; j < nb; j += k) { + + for (int j = 0; j < n; j += k) { block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } @@ -911,15 +1926,15 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = ggml_fp32_to_fp16(max_scale/63.f); - y[i].dmin = ggml_fp32_to_fp16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = ggml_fp16_to_fp32(y[i].d) * sc; + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; if (!d) continue; - const float dm = ggml_fp16_to_fp32(y[i].dmin) * m; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + dm)/d); l = MAX(0, MIN(31, l)); @@ -963,10 +1978,10 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict int l = nearest_int(iscale*scales[j]); y[i].scales[j] = MAX(-128, MIN(127, l)); } - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); for (int j = 0; j < QK_K/16; ++j) { - const float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; + const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; if (!d) continue; for (int ii = 0; ii < 16; ++ii) { int l = nearest_int(x[16*j + ii]/d); @@ -1010,8 +2025,8 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int #if QK_K == 256 - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float min = GGML_FP16_TO_FP32(x[i].dmin); int is = 0; uint8_t sc, m; @@ -1027,7 +2042,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int u1 <<= 2; u2 <<= 2; } #else - float d = ggml_fp16_to_fp32(x[i].d); + float d = GGML_FP16_TO_FP32(x[i].d); const int8_t * restrict s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 
0 : 16)); @@ -1052,9 +2067,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; - (void)hist; - for (int j = 0; j < nb; j += k) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } @@ -1090,19 +2105,19 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict if (!max_abs_scale) { memset(&y[i], 0, sizeof(block_q6_K)); - y[i].d = ggml_fp32_to_fp16(0.f); + y[i].d = GGML_FP32_TO_FP16(0.f); x += QK_K; continue; } float iscale = -128.f/max_scale; - y[i].d = ggml_fp32_to_fp16(1/iscale); + y[i].d = GGML_FP32_TO_FP16(1/iscale); for (int ib = 0; ib < QK_K/16; ++ib) { y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); } for (int j = 0; j < QK_K/16; ++j) { - float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; + float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; if (!d) { continue; } @@ -1151,7 +2166,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict ql = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -1200,11 +2215,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - (void)hist; // TODO - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } @@ -1272,15 +2285,6 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { // #if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - // shuffles to pick the required scales in dot products static inline __m256i get_scale_shuffle_q3k(int i) { static const uint8_t k_shuffle[128] = { @@ -1319,6 +2323,1224 @@ static inline __m128i get_scale_shuffle(int i) { } #endif +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = 
vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
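+ // In scalar terms, each iteration of this loop computes the same thing as the
+ // scalar fallback at the end of this function:
+ //
+ //   int sumi = 0;
+ //   for (int j = 0; j < qk/2; ++j) {
+ //       sumi += ((x[i].qs[j] & 0x0F) - 8) * y[i].qs[j]
+ //             + ((x[i].qs[j] >>   4) - 8) * y[i].qs[j + qk/2];
+ //   }
+ //   sumf += sumi * GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d);
+ //
+ // i.e. the unpacked nibbles are recentered from [ 0 .. 15 ] to [ -8 .. 7 ]
+ // before the int8 dot product, and each block's sum is scaled by d_x*d_y.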
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( 
d_2_3, p3 ); + } + + assert(nb % 2 == 0); // TODO: handle odd nb + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + // subtract offset + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for 
(int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i].d; + + 
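+ // q4_1 stores a scale d and a minimum m per block, so a value dequantizes as
+ // q*d + m. The block dot product therefore splits into two terms:
+ //
+ //   d_x*d_y * sum(q_x[j]*q_y[j])   +   m_x * (d_y * sum(q_y[j]))
+ //
+ // The second term uses y[i].s, which quantize_row_q8_1 precomputes as
+ // d_y * sum(q_y); that is what `summs` accumulates below. The SIMD code that
+ // follows only has to produce the first, purely integer, dot-product term.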
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + // load elements + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t 
qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = 
wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // These tempory registers are for masking and shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); + + vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); + vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); + vuint32m2_t xhr_0 = 
__riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); + vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); + vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + 
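/* Illustrative sketch, not part of this patch: in every q5_0 path above, weight j takes its
 * low 4 bits from a nibble of qs[] and its 5th bit from bit j of the 32-bit qh word, then the
 * value is re-centered to [-16, 15]. Scalar decode of a single weight, with plain arguments
 * instead of block_q5_0 (hypothetical helper, assumes <stdint.h>): */
static float q5_0_decode_one(float d, uint32_t qh, const uint8_t qs[16], int j) {
    const int nib  = (j < 16) ? (qs[j] & 0x0F) : (qs[j - 16] >> 4); // packed nibble
    const int bit5 = (int) ((qh >> j) & 1u);                        // 5th bit from qh
    const int q    = (nib | (bit5 << 4)) - 16;                      // signed 5-bit value
    return d * (float) q;
}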
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and 
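/* Illustrative sketch, not part of this patch: q5_1 reuses the q5_0 bit layout but keeps the
 * 5-bit value unsigned (0..31) and carries a per-block min, so the kernels above OR the high
 * bit in ("add high bit") instead of subtracting 16, and the m*s correction from q4_1
 * reappears. Scalar decode of one weight (hypothetical helper, plain arguments instead of
 * block_q5_1, assumes <stdint.h>): */
static float q5_1_decode_one(float d, float m, uint32_t qh, const uint8_t qs[16], int j) {
    const int nib = (j < 16) ? (qs[j] & 0x0F) : (qs[j - 16] >> 4);
    const int q   = nib | (int) (((qh >> j) & 1u) << 4); // unsigned value in 0..31
    return d * (float) q + m;                            // min added back, no centering
}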
(v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + // temporary registers for shift operations + vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); + vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // load qh + vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m2_t xhr_0 
= __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); + vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); + vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); + vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); + + // narrowing + vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); + vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); + + vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); + vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); + + // load + vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); + + vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); + vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); + + vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); + vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); + + vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); + vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); + + vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); + vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); + + vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); + vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), 
GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#endif +} + #if QK_K == 256 void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -1342,8 +3564,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1421,8 +3643,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = 
-y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1488,8 +3710,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1596,8 +3818,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); size_t vl = 16; @@ -1683,8 +3905,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1801,8 +4023,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1853,8 +4075,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -1968,8 +4190,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); isum[0] = isum[1] = isum[2] = isum[3] = 0; for (int l = 0; l < 16; ++l) { @@ -2022,7 +4244,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const uint8_t * restrict qh = x[i].hmask; @@ -2130,7 +4352,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2235,7 +4457,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * 
restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2456,7 +4678,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d*sum_t; @@ -2521,7 +4743,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2623,7 +4845,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2694,7 +4916,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -2879,7 +5101,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q8 += 8; a += 8; for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2919,8 +5141,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -3002,8 +5224,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -3068,8 +5290,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3151,8 +5373,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri size_t vl = 8; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + 
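/* Illustrative sketch, not part of this patch: the hunks in this region only replace the
 * ggml_fp16_to_fp32() call with the GGML_FP16_TO_FP32 macro; the arithmetic is unchanged.
 * For the K-quant formats that carry both a super-block scale and a super-block min
 * (q2_K, q4_K, q5_K), a weight is reconstructed roughly as x ~ d*sc*q - dmin*m, so each
 * kernel folds y[i].d into both terms up front, as seen in the `d` / `dmin` lines above.
 * Per-sub-block contribution in scalar form (illustrative only, not the exact data layout): */
static float k_quant_subblock_dot(float d, float dmin, float yd,
                                  int sc, int mn, int sum_qxqy, int sum_qy) {
    // sum_qxqy: sum over the sub-block of qx[j]*qy[j]; sum_qy: sum of qy[j]
    return (yd * d) * (float) (sc * sum_qxqy) - (yd * dmin) * (float) (mn * sum_qy);
}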
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -3262,9 +5484,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -3366,8 +5588,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -3412,8 +5634,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; - const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; + const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; const __m256 vd = _mm256_set1_ps(d); const uint16_t * a = (const uint16_t *)x[i].scales; @@ -3469,8 +5691,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]); + sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); size_t vl = 32; @@ -3519,9 +5741,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; - sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); for (int j = 0; j < QK_K/32; ++j) { for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; @@ -3569,8 +5791,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -3658,8 +5880,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; #if QK_K == 256 - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * 
ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -3740,8 +5962,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -3845,8 +6067,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -3968,9 +6190,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -4068,7 +6290,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -4114,7 +6336,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); @@ -4251,7 +6473,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 
0 : 16); } - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * restrict sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { @@ -4294,7 +6516,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d_all = ggml_fp16_to_fp32(x[i].d); + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4426,7 +6648,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4506,7 +6728,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4618,7 +6840,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4735,7 +6957,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -4833,7 +7055,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -4890,7 +7112,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q4 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -5049,7 +7271,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/k_quants.h b/ggml-quants.h similarity index 63% rename from k_quants.h rename to ggml-quants.h index 9de089e7a..70c12c274 100644 --- a/k_quants.h +++ b/ggml-quants.h @@ -1,11 +1,63 @@ #pragma once -#include "ggml.h" +#include "ggml-impl.h" + +// GGML internal header #include -#include #include +#define QK4_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // 
nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +// +// Super-block quantization structures +// + // Super-block size #ifdef GGML_QKK_64 #define QK_K 64 @@ -15,18 +67,6 @@ #define K_SCALE_SIZE 12 #endif -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - -// -// Super-block quantization structures -// - // 2-bit quantization // weight is represented as x = a * q + b // 16 blocks of 16 elements each @@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); + void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); @@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); + void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); void quantize_row_q3_K(const float * restrict x, void * 
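/* Illustrative sketch, not part of this patch: the static_asserts above pin down the
 * per-block storage cost of each format. Quick check of the implied bits per weight,
 * assuming 2-byte ggml_fp16_t: */
#include <stdio.h>
int main(void) {
    const struct { const char *name; int bytes; } fmt[] = {
        { "q4_0", 2 + 16         }, // fp16 d           + 16 nibble bytes
        { "q4_1", 2 + 2 + 16     }, // fp16 d, m        + 16 nibble bytes
        { "q5_0", 2 + 4 + 16     }, // fp16 d, qh[4]    + 16 nibble bytes
        { "q5_1", 2 + 2 + 4 + 16 }, // fp16 d, m, qh[4] + 16 nibble bytes
        { "q8_0", 2 + 32         }, // fp16 d           + 32 int8 quants
        { "q8_1", 4 + 4 + 32     }, // float d, s       + 32 int8 quants
    };
    for (int i = 0; i < 6; ++i) {
        // all of these formats store 32 weights per block
        printf("%s: %2d bytes/block = %.2f bits per weight\n",
               fmt[i].name, fmt[i].bytes, 8.0 * fmt[i].bytes / 32.0);
    }
    return 0;
}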
restrict y, int k); void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); @@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); // Dequantization +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); +//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); + void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); @@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); // Dot product +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -// Quantization with histogram collection -size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - diff --git a/ggml.c b/ggml.c index 1f5598fa6..605a27940 100644 --- a/ggml.c +++ b/ggml.c @@ -1,10 +1,8 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC -#include "ggml.h" - -#ifdef GGML_USE_K_QUANTS -#include "k_quants.h" -#endif +#include "ggml-impl.h" +#include "ggml-quants.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -30,18 +28,6 @@ #include #endif -// static_assert should be a #define, but if it's not, -// fall back to the _Static_assert C11 keyword. 
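/* Illustrative sketch, not part of this patch: taken together, the quantize_row_* and
 * ggml_vec_dot_* declarations above form the usage pattern for these kernels: quantize both
 * rows, then call the matching dot product. Hedged example for q4_0 x q8_0 (n must be a
 * multiple of 32; the 18- and 34-byte block sizes are the ones asserted in ggml-quants.h;
 * prototypes restated here without restrict for brevity): */
#include <stdlib.h>

void quantize_row_q4_0(const float *x, void *y, int k);
void quantize_row_q8_0(const float *x, void *y, int k);
void ggml_vec_dot_q4_0_q8_0(int n, float *s, const void *vx, const void *vy);

float dot_rows_q4_0_q8_0(const float *xrow, const float *yrow, int n) {
    void * qx = malloc((size_t) (n / 32) * 18); // n/32 block_q4_0
    void * qy = malloc((size_t) (n / 32) * 34); // n/32 block_q8_0
    quantize_row_q4_0(xrow, qx, n);             // weights at 4.5 bits each
    quantize_row_q8_0(yrow, qy, n);             // activations at 8.5 bits each
    float s = 0.0f;
    ggml_vec_dot_q4_0_q8_0(n, &s, qx, qy);      // dispatches to the SIMD path built in
    free(qx);
    free(qy);
    return s;
}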
-// if C99 - static_assert is noop -// ref: https://stackoverflow.com/a/53923785/4039976 -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -109,23 +95,11 @@ typedef void * thread_ret_t; #include #endif + #ifdef GGML_USE_CPU_HBM #include #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#ifndef __SSE3__ -#define __SSE3__ -#endif -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -251,228 +225,27 @@ inline static void * ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - // floating point type used to accumulate sums typedef double ggml_float; -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) - -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - 
fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // __ARM_NEON - // // global data // // precomputed gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_f16[1 << 16]; +static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_quick_f16[1 << 16]; +static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // precomputed silu table for f16 (128 KB) -static ggml_fp16_t table_silu_f16[1 << 16]; +static ggml_fp16_t ggml_table_silu_f16[1 << 16]; // precomputed exp table for f16 (128 KB) -static ggml_fp16_t table_exp_f16[1 << 16]; +static ggml_fp16_t ggml_table_exp_f16[1 << 16]; -// precomputed f32 table for f16 (256 KB) -static float table_f32_f16[1 << 16]; - -#if defined(__ARM_NEON) || defined(__wasm_simd128__) -#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s -#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) -#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) -#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) -#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) -#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) -#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) -#define B8(c,s ) B7(c,s, c), B7(c,s, s) - -// precomputed tables for expanding 8bits to 8 bytes: -static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 -static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 -#endif - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -#endif +// precomputed f32 table for f16 (256 KB) (ggml-impl.h) +float ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API @@ -571,7 +344,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -588,1071 +360,8 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -// -// quantization -// - -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); -} - -#if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); 
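/* Illustrative sketch, not part of this patch: on targets without a native fp16 conversion,
 * GGML_FP16_TO_FP32 goes through a 65536-entry table indexed by the raw 16-bit pattern (the
 * table_f32_f16 removed here, re-exported above as ggml_table_f32_f16). Minimal restatement
 * of that idea, assuming some bit-exact software converter compute_fp16_to_fp32() exists
 * (hypothetical name, for illustration only): */
#include <stdint.h>

extern float compute_fp16_to_fp32(uint16_t bits); // assumed software converter

static float f32_from_f16_table[1 << 16];

static void init_f16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        f32_from_f16_table[i] = compute_fp16_to_fp32((uint16_t) i); // filled once at startup
    }
}

static float lookup_f16(uint16_t h) {
    return f32_from_f16_table[h]; // one load per conversion afterwards
}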
-} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if defined(__AVX2__) || defined(__AVX512F__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if __AVXVNNI__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -#endif -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { -#if __AVXVNNIINT8__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - return mul_sum_us8_pairs_float(ax, sy); -#endif -} - -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh -#if __AVX512F__ - const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 - bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh - return _mm256_cvtepi16_epi8(bytes); // abcd_efgh -#else - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 
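/* Illustrative sketch, not part of this patch: bytes_from_nibbles_32 above is the SIMD form
 * of a plain unpack. Sixteen packed bytes become 32 values in 0..15: the low nibbles fill
 * the first 16 outputs, the high nibbles the last 16, which is the order the q4/q5 scalar
 * reference loops expect. Scalar equivalent (assumes <stdint.h>): */
static void bytes_from_nibbles_32_scalar(const uint8_t in[16], uint8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = in[j] & 0x0F; // low nibble  -> first half
        out[j + 16] = in[j] >> 4;   // high nibble -> second half
    }
}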
); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return _mm_packus_epi16( r0, r1 ); -#endif -} -#elif defined(__AVX__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); - const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); - __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); - __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); - const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytesl = _mm_or_si128(bytesl, bit_mask); - bytesh = _mm_or_si128(bytesh, bit_mask); - bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); - bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); - return MM256_SET_M128I(bytesh, bytesl); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); - __m128i tmph = _mm_srli_epi16(tmpl, 4); - const __m128i lowMask = _mm_set1_epi8(0xF); - tmpl = _mm_and_si128(lowMask, tmpl); - tmph = _mm_and_si128(lowMask, tmph); - return MM256_SET_M128I(tmph, tmpl); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { - const __m128i ones = _mm_set1_epi16(1); - const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); - const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); - const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - const __m128i axl = _mm256_castsi256_si128(ax); - const __m128i axh = _mm256_extractf128_si256(ax, 1); - const __m128i syl = _mm256_castsi256_si128(sy); - const __m128i syh = _mm256_extractf128_si256(sy, 1); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m128i xl = _mm256_castsi256_si128(x); - const __m128i xh = _mm256_extractf128_si256(x, 1); - const __m128i yl = _mm256_castsi256_si128(y); - const __m128i yh = _mm256_extractf128_si256(y, 1); - // Get absolute values of x vectors - const __m128i axl = _mm_sign_epi8(xl, xl); - const __m128i axh = _mm_sign_epi8(xh, xh); - // Sign the values of the y vectors - const __m128i syl = _mm_sign_epi8(yl, xl); - const __m128i syh = _mm_sign_epi8(yh, xh); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, 
bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} -#endif -#elif defined(__SSSE3__) -// horizontally add 4x4 floats -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =_mm_hadd_ps(a, b); - __m128 res_1 =_mm_hadd_ps(c, d); - __m128 res =_mm_hadd_ps(res_0, res_1); - res =_mm_hadd_ps(res, res); - res =_mm_hadd_ps(res, res); - - return _mm_cvtss_f32(res); -} -#endif // __AVX__ || __AVX2__ || __AVX512F__ -#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) - -#if defined(__ARM_NEON) - -#if !defined(__aarch64__) - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -#endif -#endif - -#define QK4_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); - -#define QK4_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); - -#define QK5_0 32 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); - -#define QK5_1 32 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); - -#define QK8_0 32 -typedef struct { - ggml_fp16_t d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); - -#define QK8_1 32 -typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants -} block_q8_1; -static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); - -// reference implementation for deterministic creation of model files -static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + 
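Taken together, the block layouts above fix the storage cost per 32 weights (ignoring any per-row bookkeeping): q4_0 is 18 bytes (one fp16 scale plus 16 bytes of nibbles, 4.5 bits per weight), q4_1 is 20 bytes (5.0), q5_0 is 22 bytes (5.5), q5_1 is 24 bytes (6.0), q8_0 is 34 bytes (8.5) and q8_1 is 40 bytes (10.0, since its d and s are full floats). These are exactly the sizes the static_asserts pin down.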
j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); -} - -static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { - const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } - } -} - -static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); -} - -static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(qh)); - } -} - -static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); -} - -static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { - const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 5) - 1); - const float id = d ? 
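As a usage sketch of the q4_0 routines above (a fragment only; the test data and the use of MAX/fabsf from the surrounding file are arbitrary choices for the example), a round trip through one block shows the format: 32 floats collapse to an fp16 scale d = max / -8 plus 16 bytes of nibbles, and each value is reconstructed as (q - 8)*d, so the error stays on the order of d:

    float      src[QK4_0];
    float      dst[QK4_0];
    block_q4_0 blk;

    for (int j = 0; j < QK4_0; ++j) {
        src[j] = 0.1f*(j - 16);   // arbitrary test data
    }

    quantize_row_q4_0_reference(src, &blk, QK4_0);   // 32 floats -> 18 bytes
    dequantize_row_q4_0(&blk, dst, QK4_0);           // 18 bytes  -> 32 floats

    float max_err = 0.0f;
    for (int j = 0; j < QK4_0; ++j) {
        max_err = MAX(max_err, fabsf(dst[j] - src[j]));
    }
    // max_err is bounded by roughly |d| (here d is about 0.2) per element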
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - uint32_t qh = 0; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); - - // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); - } - - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); - } -} - -static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = x[i*QK8_0 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = x[i*QK8_0 + j]*id; - - y[i].qs[j] = roundf(x0); - } - } -} - -static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - } - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
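An illustrative walk-through of the 5-bit packing used by the q5_0/q5_1 writers above: a quant value of 19 (binary 10011) at position j in the first half of a block stores its low nibble, 19 & 0x0F = 3, in the low half of qs[j], and its fifth bit, (19 & 0x10) >> 4 = 1, at bit j of the 32-bit qh word (the second half of the block uses bits j + qk/2). On the read side the value is rebuilt as (qs[j] & 0x0F) | (((qh >> j) << 4) & 0x10), i.e. 3 | 16 = 19, then shifted by -16 and scaled by d for q5_0, or scaled by d and offset by the stored min m for q5_1.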
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, 
ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_0); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - } -#else - // scalar - quantize_row_q8_0_reference(x, y, k); -#endif -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { - assert(QK8_1 == 32); - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_1; j++) { - const float v = x[i*QK8_1 + j]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - int sum = 0; - - for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + j]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); - - sum += y[i].qs[ j]; - sum += y[i].qs[QK8_1/2 + j]; - } - - y[i].s = sum*d; - } -} - -static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
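The s field filled in by quantize_row_q8_1_reference above is the precomputed d * sum(qs) of the block. It lets the *_1 dot products account for the other operand's min offset without re-walking y's quants: for a q4_1 or q5_1 block with scale dx and min m multiplied against a q8_1 block,

    sum_j (dx*qx[j] + m) * (dy*qy[j]) = (dx*dy) * sum_j qx[j]*qy[j] + m * (dy * sum_j qy[j])
                                      = (dx*dy) * sumi              + m * y->s

which is the sumf += (dx*dy)*sumi + m*s shape the q4_1 and q5_1 kernels further down reduce to.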
1.0f/d : 0.0f; - - y[i].d = d; - - int32x4_t accv = vdupq_n_s32(0); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - - accv = vaddq_s32(accv, vi); - } - - y[i].s = d * vaddvq_s32(accv); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - v128_t accv = wasm_i32x4_splat(0); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - - accv = wasm_i32x4_add(accv, vi); - } - - y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3)); - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v_intrinsic) - - size_t vl = __riscv_vsetvl_e32m4(QK8_1); - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl); - - vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - - vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); - - // convert to integer - vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); - vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); - - // store result - __riscv_vse8_v_i8m1(y[i].qs , vs, vl); - - // compute sum for y[i].s - vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); - vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl); - - // set y[i].s - int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = sum*d; - } -#else - // scalar - quantize_row_q8_1_reference(x, y, k); -#endif -} - -static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { - static const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { - static const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F); - const int x1 = (x[i].qs[j] >> 4); - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { - static const int qk = QK5_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { - static const int qk = QK5_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int x0 = (x[i].qs[j] & 0x0F) | xh_0; - const int x1 = (x[i].qs[j] >> 4) | xh_1; - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - -static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { - static const int qk = QK8_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - const block_q8_0 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = x[i].qs[j]*d; - } - } -} - static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void 
ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -1714,6 +423,28 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }, + [5] = { // GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", .blck_size = QK5_0, @@ -1741,7 +472,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK8_0, .type_size = sizeof(block_q8_0), .is_quantized = true, - .to_float = dequantize_row_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, .from_float = quantize_row_q8_0, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, @@ -1756,7 +487,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, -#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, @@ -1819,7 +549,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .from_float = quantize_row_q8_K, } -#endif }; // For internal test use @@ -1828,7 +557,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -2444,1218 +1172,6 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest *s = sumf; } -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t 
v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__SSSE3__) - // set constants - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - // Initialize accumulator with zeros - __m128 acc_0 = _mm_setzero_ps(); - __m128 acc_1 = _mm_setzero_ps(); - __m128 acc_2 = _mm_setzero_ps(); - __m128 acc_3 = _mm_setzero_ps(); - - // First round without accumulation - { - _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - acc_0 = _mm_mul_ps( d_0_1, p0 ); - acc_1 = _mm_mul_ps( d_0_1, p1 ); - acc_2 = _mm_mul_ps( d_2_3, p2 ); - acc_3 = _mm_mul_ps( 
d_2_3, p3 ); - } - - // Main loop - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 2; i < nb; i+=2) { - _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); - - __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); - __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); - __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); - __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); - - // Acummulate - acc_0 = _mm_add_ps(p0_d, acc_0); - acc_1 = _mm_add_ps(p1_d, acc_1); - acc_2 = _mm_add_ps(p2_d, acc_2); - acc_3 = _mm_add_ps(p3_d, acc_3); - } - - *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - // subtract offset - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for 
(int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F) - 8; - const int v1 = (x[i].qs[j] >> 4) - 8; - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; - - // TODO: add WASM SIMD -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (int i = 0; i < nb; ++i) { - const float d0 = GGML_FP16_TO_FP32(x[i].d); - const float d1 = 
y[i].d; - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - const __m256 d0v = _mm256_set1_ps( d0 ); - const __m256 d1v = _mm256_set1_ps( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - - const __m256 xy = mul_sum_us8_pairs_float(bx, by); - - // Accumulate d0*d1*x*y -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d0d1, xy, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); -#endif - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - for (int i = 0; i < nb; i++) { - // load elements - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0x0F); - const int v1 = (x[i].qs[j] >> 4); - - sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_0); - - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - // extract the 5th bit via lookup table ((!b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); 
- const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); - - tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); - const v128_t v0hf 
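A scalar reading of the 5th-bit lookup in the NEON path above (an illustrative sketch; the helper name is made up): table_b2b_1 expands each qh bit b into the byte (!b) << 4, so it holds 0x10 where the high bit is clear and 0x00 where it is set, and the single vsubq_s8 then covers both "or in the high bit" and "subtract 16" of the q5_0 format:

    // per-lane equivalent of vsubq_s8(v0_0l, qhl0)
    static inline int8_t q5_0_value_ref(uint8_t nibble, int high_bit) {
        // high bit set:   nibble - 0x00 == (nibble | 0x10) - 16
        // high bit clear: nibble - 0x10 ==  nibble          - 16
        return (int8_t)(nibble - ((high_bit ? 0 : 1) << 4));
    }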
= wasm_i8x16_sub(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - bx = _mm256_or_si256(bx, bxhi); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8((char)0xF0); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_andnot_si128(bxhil, mask); - bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); - - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - uint32_t qh; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - // These tempory registers are for masking and shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); - - vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl); - vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); - - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); - - // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl); - vuint32m2_t xhr_0 
= __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); - - // ((qh & (1u << (j + 16))) >> (j + 12)); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl); - vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl); - - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); - - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); - - vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl); - vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - - const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); - assert(qk == QK5_1); - - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; - - // extract the 5th bit via lookup table ((b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_0[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_0[(qh1 >> 8) & 
0xFF]; - tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit - const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined(__wasm_simd128__) - v128_t sumv = wasm_f32x4_splat(0.0f); - - float summs = 0.0f; - - uint32_t qh; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; - - summs += GGML_FP16_TO_FP32(x0->m) * y0->s; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh, x0->qh, sizeof(qh)); - - tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = 
wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit - const v128_t v0lf = wasm_v128_or(v0l, qhl); - const v128_t v0hf = wasm_v128_or(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); - } - - *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0.0f; - - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - __m256i bxhi = bytes_from_bits_32(x[i].qh); - bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - bx = _mm256_or_si256(bx, bxhi); - - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx, by); - - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8(0x10); - - float summs = 0.0f; - - // Main loop - for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i bxhi = bytes_from_bits_32(x[i].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_and_si128(bxhil, mask); - bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx); - __m128i bxh = _mm256_extractf128_si256(bx, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx = MM256_SET_M128I(bxh, bxl); - - const __m256 dy = _mm256_set1_ps(y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx, by); - - acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); - } - - *s = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - - uint32_t qh; - - size_t vl = __riscv_vsetvl_e8m1(qk/2); - - // temporary registers for shift operations - vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); - vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl); - - for (int i = 0; i < nb; i++) { - memcpy(&qh, x[i].qh, sizeof(uint32_t)); - - // load qh - vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl); - - // ((qh >> (j + 0)) << 4) & 0x10; - 
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl); - vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl); - vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl); - - // ((qh >> (j + 12)) ) & 0x10; - vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl); - vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl); - - // narrowing - vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl); - vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl); - - vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl); - vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl); - - // load - vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); - - vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); - vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); - - vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); - vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); - - vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl); - vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl); - - vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); - vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); - - vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); - vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - - vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); - - int sumi = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; - const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - - sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); - } - - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; - } - - *s = sumf; -#endif -} - -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - 
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - -#else - const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); - const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); - const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); - const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); - - const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); - const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); - const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); - const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); - - const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); - const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); - const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); - const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - // Multiply q with scale and accumulate -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d, q, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); -#endif - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - size_t vl = __riscv_vsetvl_e8m1(qk); - - for (int i = 0; i < nb; i++) { - // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); - - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[i].qs[j]*y[i].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#endif -} - // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { @@ -3848,7 +1364,7 @@ inline static float ggml_gelu_f32(float x) { inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { - y[i] = table_gelu_f16[i16[i]]; + y[i] = ggml_table_gelu_f16[i16[i]]; } } @@ -3858,7 +1374,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); 
- y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } #else @@ -3876,7 +1392,7 @@ inline static float ggml_gelu_quick_f32(float x) { //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_gelu_quick_f16[i16[i]]; +// y[i] = ggml_table_gelu_quick_f16[i16[i]]; // } //} @@ -3886,7 +1402,7 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -3905,7 +1421,7 @@ inline static float ggml_silu_f32(float x) { //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { -// y[i] = table_silu_f16[i16[i]]; +// y[i] = ggml_table_silu_f16[i16[i]]; // } //} @@ -3915,7 +1431,7 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); + y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]); } } #else @@ -4057,16 +1573,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4092,7 +1609,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4143,16 +1660,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4178,7 +1696,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4209,8 +1727,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_1D_STAGE_0 ] = true; p[GGML_OP_CONV_1D_STAGE_1 ] = true; - p[GGML_OP_CONV_2D ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_2D_STAGE_0 ] = true; + p[GGML_OP_CONV_2D_STAGE_1 ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -4627,11 +2147,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { for (int i = 0; i < (1 << 16); ++i) { uint16_t ui = i; memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = 
GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); - table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); + const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); + ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); + ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); @@ -5494,6 +3014,39 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + return (struct ggml_tensor *)(mem_buffer + obj->offs); + } + + obj = obj->next; + } + + return NULL; +} + +struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { + struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); + obj = obj->next; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + return (struct ggml_tensor *)(mem_buffer + obj->offs); + } + + obj = obj->next; + } + + return NULL; +} + struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { struct ggml_object * obj = ctx->objects_begin; @@ -5601,7 +3154,7 @@ static struct ggml_tensor * ggml_add_cast_impl( // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); - GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16 bool is_node = false; @@ -5921,7 +3474,6 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } - // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -5975,7 +3527,6 @@ struct ggml_tensor * ggml_sum( return result; } - // ggml_sum_rows struct ggml_tensor * ggml_sum_rows( @@ -6607,7 +4158,6 @@ struct ggml_tensor * ggml_set_2d_inplace( return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // ggml_cpy static struct ggml_tensor * ggml_cpy_impl( @@ -6687,7 +4237,6 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, @@ -7140,7 +4689,6 @@ struct ggml_tensor * ggml_diag( return result; } - // ggml_diag_mask_inf static struct ggml_tensor * ggml_diag_mask_inf_impl( @@ -7252,7 +4800,6 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } - // ggml_soft_max_back static struct ggml_tensor * ggml_soft_max_back_impl( @@ -7299,8 +4846,13 @@ static struct ggml_tensor * ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, float xpos_base, bool xpos_down, bool inplace) { @@ -7316,11 +4868,15 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &xpos_base, sizeof(float)); - memcpy(params + 7, &xpos_down, sizeof(bool)); + int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -7338,7 +4894,9 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ); } struct ggml_tensor * ggml_rope_inplace( @@ -7348,7 +4906,9 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); } struct ggml_tensor * ggml_rope_custom( @@ -7358,9 +4918,17 @@ struct ggml_tensor * ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -7370,9 +4938,17 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ); } struct ggml_tensor * ggml_rope_xpos_inplace( @@ -7382,7 +4958,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // ggml_rope_back @@ -7669,7 +5245,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // ggml_conv_2d -struct ggml_tensor * ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct ggml_tensor * ggml_conv_2d_stage_0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7688,17 +5268,21 @@ struct ggml_tensor * ggml_conv_2d( is_node = true; } + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], 
a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_2D; + result->op = GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7707,8 +5291,61 @@ struct ggml_tensor * ggml_conv_2d( } -// ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct ggml_tensor * ggml_conv_2d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// ggml_conv_2d_sk_p0 struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8147,7 +5784,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( return result; } - struct ggml_tensor * ggml_add_rel_pos( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8592,8 +6228,6 @@ struct ggml_tensor * ggml_map_custom3_inplace( return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -8647,6 +6281,7 @@ void ggml_set_param( GGML_ASSERT(tensor->grad == NULL); tensor->grad = ggml_dup_tensor(ctx, tensor); + ggml_format_name(tensor->grad, "%s (grad)", tensor->name); } // ggml_compute_forward_dup @@ -9322,9 +6957,15 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + } + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread @@ -9335,18 +6976,35 @@ static void ggml_compute_forward_add_f16_f32( const int ir1 = MIN(ir0 + dr, nr); if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - 
i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + if (dst->type == GGML_TYPE_F16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } } } } @@ -9794,7 +7452,6 @@ static void ggml_compute_forward_add1( } } - // ggml_compute_forward_acc static void ggml_compute_forward_acc_f32( @@ -9934,7 +7591,6 @@ static void ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10115,7 +7771,6 @@ static void ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE UNUSED(ggml_vec_div_f32); @@ -10253,7 +7908,6 @@ static void ggml_compute_forward_sqrt( } } - // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12086,7 +9740,6 @@ static void ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12282,7 +9935,6 @@ static void ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12680,7 +10332,6 @@ static void ggml_compute_forward_get_rows_back_f32( } } - static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -12915,7 +10566,7 @@ static void ggml_compute_forward_soft_max_f32( // const float val = (sp[i] == -INFINITY) ? 
0.0 : exp(sp[i] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); sum += (ggml_float)val; dp[i] = val; } @@ -13280,6 +10931,45 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); +} + static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -13289,21 +10979,26 @@ static void ggml_compute_forward_rope_f32( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // these two only relevant for xPos RoPE: float xpos_base; bool xpos_down; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, 
sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); GGML_TENSOR_UNARY_OP_LOCALS @@ -13331,6 +11026,9 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -13344,18 +11042,18 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -13373,13 +11071,16 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13393,12 +11094,19 @@ static void ggml_compute_forward_rope_f32( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13427,15 +11135,19 @@ static void ggml_compute_forward_rope_f16( return; } - float freq_base; - float freq_scale; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); GGML_TENSOR_UNARY_OP_LOCALS @@ -13463,6 +11175,9 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -13476,18 +11191,18 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + 
i0*nb00); @@ -13503,12 +11218,14 @@ static void ggml_compute_forward_rope_f16( dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); } - } if (!is_neox) { + } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); - theta *= theta_scale; + theta_base *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13522,12 +11239,19 @@ static void ggml_compute_forward_rope_f16( } else { // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; - theta *= theta_scale; + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); + + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13635,17 +11359,18 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); + // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta *= theta_scale; + theta_base *= theta_scale; const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13659,10 +11384,10 @@ static void ggml_compute_forward_rope_back_f32( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13735,14 +11460,14 @@ static void ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -13756,10 +11481,10 @@ static void ggml_compute_forward_rope_back_f16( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -13963,6 +11688,7 @@ static void ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, ggml_fp16_t * A, ggml_fp16_t * B, @@ -14264,6 +11990,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14336,7 +12065,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14355,6 +12084,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14416,6 +12148,144 @@ static void ggml_compute_forward_conv_transpose_1d( // ggml_compute_forward_conv_2d +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_conv_2d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + 
const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void ggml_compute_forward_conv_2d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14430,14 +12300,38 @@ static void ggml_compute_forward_conv_2d_f16_f32( GGML_TENSOR_BINARY_OP_LOCALS + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: 
OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; + const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14453,24 +12347,27 @@ static void ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14487,30 +12384,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14536,6 +12425,48 @@ static 
void ggml_compute_forward_conv_2d( } } +static void ggml_compute_forward_conv_2d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -14594,6 +12525,8 @@ static void ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14996,7 +12929,7 @@ static void ggml_compute_forward_flash_attn_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (ggml_float)val; SS[j] = val; @@ -15198,7 +13131,7 @@ static void ggml_compute_forward_flash_attn_f16( } else { ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); sump[j] += (ggml_float)val; SS[j] = val; } @@ -15649,7 +13582,7 @@ static void ggml_compute_forward_flash_attn_back_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]); #endif sump[j] += (ggml_float)val; SW[j] = val; @@ -16092,7 +14025,6 @@ static void ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16159,7 +14091,6 @@ static void ggml_compute_forward_map_unary_f32( } } - static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16207,7 +14138,6 @@ static void ggml_compute_forward_map_binary_f32( } } - static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16259,7 +14189,6 @@ static void ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -16403,7 +14332,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); #endif sum += (ggml_float)val; st[i] = val; @@ -16517,7 +14446,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( #else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const 
float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]); #endif sum += (ggml_float)val; ds0[i] = val; @@ -16534,7 +14463,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_sub_f32(nc, ds0, ds0, s1); ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16562,12 +14490,15 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); + if (tensor->op == GGML_OP_NONE) { + return; + } + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -16770,6 +14701,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17670,9 +15609,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1, n_dims, mode, + 0, n_ctx, freq_base, freq_scale, + 0.0f, + 1.0f, + 0.0f, + 0.0f, xpos_base, xpos_down, false), @@ -17699,11 +15643,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_2D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_1: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -18632,6 +16584,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18642,7 +16595,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (node->src[0]->type == GGML_TYPE_F16 && node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18652,6 +16606,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19136,6 +17098,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { if (idx == -1) { fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + fclose(fout); return; } @@ -19839,7 +17802,6 @@ static enum ggml_opt_result ggml_opt_adam( opt->loss_after 
= fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); @@ -20688,7 +18650,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; -#ifdef GGML_USE_K_QUANTS case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); @@ -20719,7 +18680,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q6_K * block = (block_q6_K*)dst + start / QK_K; result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; -#endif case GGML_TYPE_F16: { int elemsize = sizeof(ggml_fp16_t); @@ -20810,7 +18770,7 @@ struct gguf_kv { }; struct gguf_header { - uint32_t magic; + char magic[4]; uint32_t version; uint64_t n_tensors; // GGUFv2 uint64_t n_kv; // GGUFv2 @@ -20851,8 +18811,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) return n == size; } -// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 -static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { +static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; @@ -20864,23 +18823,10 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset return ok; } -static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { - p->n = 0; - p->data = NULL; - - bool ok = true; - - uint32_t n = 0; - ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; - ok = ok && gguf_fread_el(file, p->data, p->n, offset); - - return ok; -} - struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); - ctx->header.magic = GGUF_MAGIC; + memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; ctx->header.n_tensors = 0; ctx->header.n_kv = 0; @@ -20906,16 +18852,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // offset from start of file size_t offset = 0; - uint32_t magic = 0; + char magic[4]; // check the magic before making allocations { gguf_fread_el(file, &magic, sizeof(magic), &offset); - if (magic != GGUF_MAGIC) { - fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); - fclose(file); - return NULL; + for (uint32_t i = 0; i < sizeof(magic); i++) { + if (magic[i] != GGUF_MAGIC[i]) { + fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fclose(file); + return NULL; + } } } @@ -20925,27 +18873,22 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the header { - ctx->header.magic = magic; + strncpy(ctx->header.magic, magic, 4); + ctx->kv = NULL; ctx->infos = NULL; ctx->data = NULL; ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n_tensors = 0; - uint32_t n_kv = 0; - - ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); - ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); - - ctx->header.n_tensors = n_tensors; - ctx->header.n_kv = n_kv; - } else { - ok = ok 
&& gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; } if (!ok) { @@ -20956,12 +18899,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; - if (ctx->header.version == 1) { - gguf_fread_str = gguf_fread_str_v1; - } - // read the kv pairs { ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); @@ -20992,15 +18929,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_ARRAY: { ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t n = 0; - ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); - kv->value.arr.n = n; - } else { - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); - } + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -21059,14 +18988,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_str(file, &info->name, &offset); ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - if (ctx->header.version == 1) { - // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 - uint32_t t = 0; - ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); - info->ne[j] = t; - } else { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); - } + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); diff --git a/ggml.h b/ggml.h index 3eddc44b9..70eb25a6b 100644 --- a/ggml.h +++ b/ggml.h @@ -219,7 +219,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 -#define GGML_MAX_OP_PARAMS 32 +#define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -231,8 +231,9 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 -#define GGUF_MAGIC 0x46554747 // "GGUF" -#define GGUF_VERSION 2 +#define GGUF_MAGIC "GGUF" + +#define GGUF_VERSION 3 #define GGUF_DEFAULT_ALIGNMENT 32 @@ -400,15 +401,16 @@ extern "C" { GGML_OP_ALIBI, GGML_OP_CLAMP, GGML_OP_CONV_1D, - GGML_OP_CONV_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_2D_STAGE_0, // internal + GGML_OP_CONV_2D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -704,7 +706,10 @@ extern "C" { GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); - GGML_API struct ggml_tensor * 
ggml_get_tensor(struct ggml_context * ctx, const char * name); + // Context tensor enumeration and lookup + GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); @@ -1016,9 +1021,9 @@ extern "C" { struct ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1321,8 +1326,13 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1332,8 +1342,17 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( @@ -1925,12 +1944,19 @@ extern "C" { // quantization // + // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); // diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 557ce7ac0..727b4e554 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -7,7 +7,7 @@ import shutil import struct import sys import tempfile -from enum import IntEnum, auto +from enum import Enum, IntEnum, auto from io import BufferedWriter from pathlib import Path from typing import IO, Any, BinaryIO, Callable, Sequence @@ -19,9 +19,10 @@ import numpy as np # 
GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 2 +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 + # general KEY_GENERAL_ARCHITECTURE = "general.architecture" KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" @@ -52,9 +53,12 @@ KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" # RoPE -KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" -KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" -KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" +KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type" +KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor" +KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" +KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" # tokenization KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" @@ -576,6 +580,11 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 +class RopeScalingType(Enum): + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + # # implementation # @@ -597,6 +606,10 @@ class GGMLQuantizationType(IntEnum): Q6_K = 14 Q8_K = 15 +class GGUFEndian(IntEnum): + LITTLE = 0 + BIG = 1 + class GGUFValueType(IntEnum): UINT8 = 0 @@ -644,18 +657,41 @@ class GGUFWriter: temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None tensors: list[tuple[np.ndarray[Any, Any], int]] - def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True): + @property + def pack_prefix(self): + if self.endianess==GGUFEndian.LITTLE: + return "<" + else: + return ">" + + def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE): self.fout = open(path, "wb") self.arch = arch + self.endianess = endianess + self._simple_value_packing = { + GGUFValueType.UINT8: f"{self.pack_prefix}B", + GGUFValueType.INT8: f"{self.pack_prefix}b", + GGUFValueType.UINT16: f"{self.pack_prefix}H", + GGUFValueType.INT16: f"{self.pack_prefix}h", + GGUFValueType.UINT32: f"{self.pack_prefix}I", + GGUFValueType.INT32: f"{self.pack_prefix}i", + GGUFValueType.FLOAT32: f"{self.pack_prefix}f", + GGUFValueType.UINT64: f"{self.pack_prefix}Q", + GGUFValueType.INT64: f"{self.pack_prefix}q", + GGUFValueType.FLOAT64: f"{self.pack_prefix}d", + GGUFValueType.BOOL: "?" , + } self.add_architecture() self.use_temp_file = use_temp_file self.tensors = [] + endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian" + print(f"This gguf file is for {endianess_str} only") def write_header_to_file(self): self.fout.write(struct.pack(" 0: ltype = GGUFValueType.get_type(val[0]) if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += struct.pack(" bool: tokenizer_file = path / 'tokenizer.json' if not tokenizer_file.is_file(): @@ -1010,10 +1059,11 @@ class SpecialVocab: tc_content = entry_content else: continue - for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content): - if isinstance(maybe_token_id, int) and maybe_token_id >= 0: - self.special_token_ids[typ] = maybe_token_id - break + # We only need the first match here. 
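# A minimal illustrative sketch (hypothetical values) of the stdlib behaviour relied on here:
# next() with a default returns the first item the generator yields, or the default when
# nothing matches, so the removed loop-and-break collapses into a single expression:
#   next((x for x in (3, 7, 9) if x > 5), None)   # -> 7
#   next((x for x in (1, 2) if x > 5), None)      # -> None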
+ maybe_token_id = next(( + atok.get('id') for atok in added_tokens + if atok.get('content') == tc_content), None) + self._set_special_token(typ, maybe_token_id) return True def _try_load_from_config_json(self, path: Path) -> bool: @@ -1023,21 +1073,21 @@ class SpecialVocab: with open(config_file, encoding = 'utf-8') as f: config = json.load(f) for typ in self.special_token_types: - maybe_token_id = config.get(f'{typ}_token_id') - if isinstance(maybe_token_id, int) and maybe_token_id >= 0: - self.special_token_ids[typ] = maybe_token_id + self._set_special_token(typ, config.get(f'{typ}_token_id')) return True - def add_to_gguf(self, gw: GGUFWriter) -> None: + def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if len(self.merges) > 0: - print(f'gguf: Adding {len(self.merges)} merge(s).') + if not quiet: + print(f'gguf: Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) for typ, tokid in self.special_token_ids.items(): handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if handler is None: - print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping') + print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr) continue - print(f'gguf: Setting special token type {typ} to {tokid}') + if not quiet: + print(f'gguf: Setting special token type {typ} to {tokid}') handler(tokid) def __repr__(self) -> str: diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 07a7ab4dd..f0741a7c2 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.4.4" +version = "0.4.5" description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/llama.cpp b/llama.cpp index 7ed872237..bb60044b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,13 +19,11 @@ #ifdef GGML_USE_MPI # include "ggml-mpi.h" #endif -#ifdef GGML_USE_K_QUANTS -# ifndef QK_K -# ifdef GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif +#ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 # endif #endif @@ -56,13 +54,16 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -71,10 +72,10 @@ #include #include #include +#include #include #include #include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -235,6 +236,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -276,9 +281,13 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { 
LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -552,6 +561,22 @@ do { \ } \ } while (0) +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static int8_t llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -970,18 +995,19 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } -static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); - } else { + } + else { result.resize(n_tokens); } @@ -1017,8 +1043,8 @@ enum e_model { }; static const size_t kB = 1024; -static const size_t MB = kB*kB; -static const size_t GB = kB*kB*kB; +static const size_t MB = 1024*kB; +static const size_t GB = 1024*MB; struct llama_hparams { bool vocab_only; @@ -1034,28 +1060,33 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base_train; - float rope_freq_scale_train; + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; float f_clamp_kqv; float f_max_alibi_bias; bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; const float EPSILON = 1e-9; - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return 
true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; return false; @@ -1080,8 +1111,16 @@ struct llama_cparams { uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - float rope_freq_base; - float rope_freq_scale; + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool mul_mat_q; }; @@ -1113,13 +1152,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 // ff bias - struct ggml_tensor * b2; // ffn_down - struct ggml_tensor * b3; // ffn_up + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -1183,6 +1222,8 @@ struct llama_vocab { std::unordered_map token_to_id; std::vector id_to_token; + std::unordered_map special_tokens_cache; + std::map, int> bpe_ranks; // default LLaMA special tokens @@ -1192,17 +1233,17 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; - id linefeed_id = 13; + id linefeed_id = 13; id special_prefix_id = 32007; id special_middle_id = 32009; id special_suffix_id = 32008; - id special_eot_id = 32010; + id special_eot_id = 32010; int find_bpe_rank(std::string token_left, std::string token_right) const { - replace_all(token_left, " ", "\u0120"); - replace_all(token_left, "\n", "\u010A"); - replace_all(token_right, " ", "\u0120"); - replace_all(token_right, "\n", "\u010A"); + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1223,8 +1264,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; - struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -1356,10 +1397,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - // TODO: this should be: - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); - // change it and test that it works - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -1447,7 +1485,10 @@ static bool llama_kv_cache_find_slot( for (uint32_t i = 0; i < n_tokens; i++) { cache.cells[cache.head + i].pos = batch.pos[i]; - cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]); + + for (int32_t j = 0; j < batch.n_seq_id[i]; j++) { + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]); + } } return true; @@ -1464,17 +1505,12 @@ static int32_t llama_kv_cache_cell_max(const struct 
llama_kv_cache & cache) { return 0; } -static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { - if (c0 < 0) c0 = 0; - if (c1 < 0) c1 = cache.size; - - for (int32_t i = c0; i < c1; ++i) { +static void llama_kv_cache_clear(struct llama_kv_cache & cache) { + for (int32_t i = 0; i < (int32_t) cache.size; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } - - // Searching for a free slot can start here since we know it will be empty. - cache.head = uint32_t(c0); + cache.head = 0; } static void llama_kv_cache_seq_rm( @@ -1488,8 +1524,14 @@ static void llama_kv_cache_seq_rm( if (p1 < 0) p1 = std::numeric_limits::max(); for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.erase(seq_id); + if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + if (seq_id < 0) { + cache.cells[i].seq_id.clear(); + } else if (cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].seq_id.erase(seq_id); + } else { + continue; + } if (cache.cells[i].seq_id.empty()) { cache.cells[i].pos = -1; if (new_head == cache.size) new_head = i; @@ -1527,6 +1569,9 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; + } else { + cache.cells[i].seq_id.clear(); + cache.cells[i].seq_id.insert(seq_id); } } @@ -1547,14 +1592,14 @@ static void llama_kv_cache_seq_shift( for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].pos += delta; + cache.has_shift = true; + cache.cells[i].pos += delta; + cache.cells[i].delta += delta; + if (cache.cells[i].pos < 0) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; - } else { - cache.has_shift = true; - cache.cells[i].delta = delta; } } } @@ -1571,12 +1616,14 @@ static void llama_kv_cache_seq_shift( enum llama_fver { GGUF_FILE_VERSION_V1 = 1, GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2"; + case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; } return "unknown"; @@ -1790,6 +1837,12 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } + if (backend == GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } + { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2005,14 +2058,30 @@ static void llm_load_hparams( hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; + GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; GGUF_GET_KEY(ctx, 
hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + std::string rope_scaling("linear"); + GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - hparams.rope_freq_scale_train = 1.0f/ropescale; + float ropescale = 0.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); + if (ropescale == 0.0f) { // try the old key name + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { @@ -2125,7 +2194,7 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false); static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( @@ -2232,21 +2301,138 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); } else { - vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0]; + const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); + vocab.linefeed_id = ids[0]; } // special tokens - GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); - GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); - GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); + { + const std::vector> special_token_types = { + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + }; + for (const auto & it : special_token_types) { + const std::string & key = kv(std::get<0>(it)); + int32_t & id = std::get<1>(it), old_id = id; + + GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); + // Must be >= -1 and < vocab size. Since the key is unsigned, -1 + // can only come from the default value, so there's no point in + // validating that. 
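// A minimal illustrative sketch (hypothetical vocab size of 32000) of why the single
// unsigned comparison below suffices: casting id + 1 to size_t maps the default -1 to 0,
// which always passes, any other negative value wraps to a huge unsigned number and is
// rejected, and any id >= the vocab size fails the comparison directly:
//   id == -1     -> size_t(id + 1) == 0     -> kept as the default
//   id == 31999  -> size_t(id + 1) == 32000 -> accepted (not > size)
//   id == 32000  -> size_t(id + 1) == 32001 -> rejected, reverted to old_id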
+ if (size_t(id + 1) > vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", + __func__, key.c_str(), id, old_id); + id = old_id; + } + } + } + + // build special tokens cache + { + // TODO: It is unclear (to me) at this point whether special tokens are guaranteed to be of a deterministic type, + // and will always be correctly labeled in 'added_tokens.json' etc. + // The assumption is that, since special tokens aren't meant to be exposed to the end user, they are designed + // to be unmatchable by the tokenizer; therefore, tokens from the vocab which are unmatchable by the tokenizer + // are special tokens. + // From testing, this appears to correlate 1:1 with special tokens. + // + + // Counting special tokens and verifying in only one direction + // is sufficient to detect a difference between those two sets. + // + uint32_t special_tokens_count_by_type = 0; + uint32_t special_tokens_count_from_verification = 0; + + bool special_tokens_definition_mismatch = false; + + for (const auto & t : vocab.token_to_id) { + const auto & token = t.first; + const auto & id = t.second; + + // Count all non-normal tokens in the vocab while iterating + if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_count_by_type++; + } + + // Skip single character tokens + if (token.length() > 1) { + bool is_tokenizable = false; + + // Split token string representation in two, in all possible ways + // and check if both halves can be matched to a valid token + for (unsigned i = 1; i < token.length();) { + const auto left = token.substr(0, i); + const auto right = token.substr(i); + + // check if we didn't partition in the middle of a utf sequence + auto utf = utf8_len(left.at(left.length() - 1)); + + if (utf == 1) { + if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && + vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { + is_tokenizable = true; + break; + } + i++; + } else { + // skip over the rest of multibyte utf sequence + i += utf - 1; + } + } + + if (!is_tokenizable) { + // Some tokens are multibyte, but they are utf sequences with an equivalent text length of 1; + // it's faster to re-filter them here, since there are far fewer candidates now + + // Calculate a total "utf" length of a token string representation + size_t utf8_str_len = 0; + for (unsigned i = 0; i < token.length();) { + utf8_str_len++; + i += utf8_len(token.at(i)); + } + + // And skip the ones which are one character + if (utf8_str_len > 1) { + // At this point what we have left are special tokens only + vocab.special_tokens_cache[token] = id; + + // Count manually found special tokens + special_tokens_count_from_verification++; + + // If this manually found special token is not marked as such, flag a mismatch + if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_definition_mismatch = true; + } + } + } + } + } + + if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { + LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size(), + special_tokens_count_by_type, vocab.id_to_token.size() + ); + } else { + LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size() + ); + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { const auto & hparams =
model.hparams; const auto & vocab = model.vocab; + const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); @@ -2265,8 +2451,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -2359,7 +2548,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2413,21 +2602,21 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; ggml_backend_type backend_output; @@ -2479,15 +2668,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, 
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2495,7 +2684,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2558,21 +2747,21 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2623,19 +2812,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, 
tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2643,14 +2832,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); } } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; @@ -2691,31 +2880,31 @@ static void llm_load_tensors( const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 
GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); // output { @@ -2766,19 +2955,19 @@ static void llm_load_tensors( layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2786,14 +2975,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); } } } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2839,13 +3028,13 @@ static void llm_load_tensors( auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 
backend_split); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2853,8 +3042,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w2) + - ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; @@ -2884,10 +3073,10 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif defined(GGML_USE_CLBLAST) + const int max_offloadable_layers = hparams.n_layer + 3; +#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); @@ -2923,21 +3112,11 @@ static void llm_load_tensors( model.t_load_us = ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, use_mmap); + llama_model_loader ml(fname, params.use_mmap); - model.hparams.vocab_only = vocab_only; + model.hparams.vocab_only = params.vocab_only; llm_load_arch (ml, model); llm_load_hparams(ml, model); @@ -2949,15 +3128,15 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } llm_load_tensors( - ml, model, n_gpu_layers, - main_gpu, tensor_split, - use_mlock, progress_callback, progress_callback_user_data); + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -2966,2754 +3145,1882 @@ static bool llama_model_load( return true; } -static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; +// +// llm_build +// - const auto & kv_self = lctx.kv_self; +using llm_build_cb = std::function; - GGML_ASSERT(!!kv_self.ctx); +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const 
int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; - GGML_ASSERT(n_embd_head == hparams.n_rot); +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; - const int n_gpu_layers = model.n_gpu_layers; +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + const llm_build_cb & cb) { + const int64_t n_embd = hparams.n_embd; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - - //printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; struct ggml_tensor * inpL; if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + cb(inp_tokens, "inp_tokens", -1); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; + return inpL; +} - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + llm_rope_type type, + int64_t n_ctx, + int64_t n_rot, + float freq_base, + float freq_scale, + const llm_build_cb & cb) { + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor 
= cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + GGML_ASSERT(n_embd_head % n_rot == 0); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); + int rope_type = 0; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - - // shift the entire K-cache if needed - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, 
beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} - offload_func_t offload_func = llama_nop; +static void llm_build_kv_store( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const int64_t n_embd_gqa = hparams.n_embd_gqa(); -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv.v), + (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + const llama_hparams & hparams, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); } -#endif // GGML_USE_CUBLAS + } - struct ggml_tensor * inpSA = inpL; + if (mb) { + cur = ggml_add(ctx, cur, mb); + } - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + return cur; +} - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + } break; } - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + 
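            // note: the gate bias is added before the activation below; when type_gate == LLM_FFN_PAR
            // the un-gated up-projection (tmp) is only multiplied back in after the activation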
cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory + switch (type_op) { + case LLM_FFN_SILU: { - // compute the transposed [n_tokens, n_embd] V matrix + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + } - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + } - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } + return cur; +} - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_kv_cache & kv, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int64_t n_ctx, + int32_t n_tokens, + int32_t n_kv, + float max_alibi_bias, + const llm_build_cb & cb, + int il) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + if (max_alibi_bias > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); + } - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
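// The scale/mask/softmax attention math being deleted here (and the V gather plus output
// projection further down) is what llm_build_kqv now encapsulates. A minimal sketch; per the
// check above, a non-positive max_alibi_bias (e.g. -1.0f) skips the ALiBi branch:
cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL,
        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);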
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv.v)*n_ctx, + ggml_element_size(kv.v)*n_ctx*n_embd_head, + ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); + + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } + + return cur; +} + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) : + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? 
n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute (lctx.buf_compute) { + GGML_ASSERT(!!kv_self.ctx); + + // all initializations should be done in init() } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + void init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; + + ctx0 = ggml_init(params); + } + + void free() { + if (ctx0) { + ggml_free(ctx0); + ctx0 = nullptr; + } + } + + struct ggml_cgraph * build_llama() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // feed-forward network - { // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, 
model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_baichaun( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - 
offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - - // shift the entire K-cache if needed - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } - } - - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * inpSA = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; - switch (model.type) { - case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - break; - case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); - break; - default: - GGML_ASSERT(false); - } - - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - 
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; - - switch (model.type) { - case MODEL_7B: - KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - break; - case MODEL_13B: - // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - break; - default: - GGML_ASSERT(false); - } - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // 
cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - - // printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * inpSA = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } - - // self-attention - { - // compute Q and K - struct ggml_tensor * tmpk = 
ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_falcon( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - - //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", - // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - - // shift the entire K-cache if needed - if 
(do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // self-attention - // TODO: refactor into common function (shared with LLaMA) - { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); - - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); - } else { // Falcon 7B - cur = attn_norm; - } - - // compute QKV - - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
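// With the new helpers, the biased attention norm computed near the top of this (deleted)
// Falcon block reduces to one call; a sketch only, not part of this hunk:
attn_norm = llm_build_norm(ctx0, inpL, hparams,
        model.layers[il].attn_norm,
        model.layers[il].attn_norm_b,
        LLM_NORM, cb, il);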
- - const size_t wsize = ggml_type_size(cur->type); - - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - offload_func_kq(tmpq); - - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); - - // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, 
"KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * attn_out = cur; - - // feed forward - { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); - - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - } - - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - ggml_set_name(cur, "result_norm"); - } - - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_starcoder( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * token; - struct ggml_tensor * position; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } - } - - { - // Compute position embeddings. 
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(lctx.alloc, inp_positions); - if (!ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < n_tokens; ++i) { - ((int32_t *) inp_positions->data)[i] = batch.pos[i]; - } - } - ggml_set_name(inp_positions, "inp_positions"); - - position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); - } - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - inpL = ggml_add(ctx0, token, position); - ggml_set_name(inpL, "inpL"); - - for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - } - - { - // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); - - struct ggml_tensor * Qcur = tmpq; - struct ggml_tensor * Kcur = tmpk; - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] 
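The KQ_mask built a few lines above encodes two rules for each query token j and cache cell i: the cell must belong to the same sequence, and its position must not lie in the future of the query (pos_i <= pos_j); everything else gets -INFINITY so it vanishes after the soft-max. A minimal standalone restatement with plain arrays instead of ggml tensors (illustrative sketch only, not part of the diff):

#include <cmath>
#include <vector>

// Build the additive attention mask for one sequence.
// Rows are the new query tokens, columns are the cached KV cells.
static std::vector<float> make_causal_mask(
        const std::vector<int>  & cell_pos,      // positions stored in the KV cache (n_kv)
        const std::vector<bool> & cell_same_seq, // cell belongs to the query's seq_id
        const std::vector<int>  & tok_pos) {     // positions of the new tokens (n_tokens)
    const size_t n_kv     = cell_pos.size();
    const size_t n_tokens = tok_pos.size();
    std::vector<float> mask(n_kv*n_tokens, 0.0f);
    for (size_t j = 0; j < n_tokens; ++j) {
        for (size_t i = 0; i < n_kv; ++i) {
            if (!cell_same_seq[i] || cell_pos[i] > tok_pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }
    return mask; // added to KQ_scaled before the soft-max, broadcast to all heads
}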
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); - } - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - - // Add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; - - // FF - { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); - } - - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); - } - - inpL = ggml_add(ctx0, cur, inpFF); - } - - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - } - ggml_set_name(cur, "result_norm"); - - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_persimmon( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const auto & cparams = lctx.cparams; - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_head = hparams.n_head; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; - - const float freq_base = cparams.rope_freq_base; - const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - - auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. 
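When cells have been moved in the cache (kv_self.has_shift), the already-RoPE-ed keys are corrected in place by one extra RoPE pass whose per-cell position is the stored delta; the refactored builders do the same through llm_build_k_shift(). This works because rotary embeddings compose additively in the position, as the toy 2-D sketch below illustrates (illustrative helper only, not part of the diff):

#include <cmath>
#include <utility>

// Rotate one (even, odd) feature pair by an angle proportional to `pos`.
static std::pair<float, float> rope_pair(float x0, float x1, int pos, float theta) {
    const float a = pos * theta;
    return { x0*std::cos(a) - x1*std::sin(a),
             x0*std::sin(a) + x1*std::cos(a) };
}

// Applying rope_pair with position p and then again with position delta gives the same
// result as applying it once with position p + delta, so re-rotating the cached K with
// K_shift[i] = cells[i].delta is equivalent to having stored it at the new position.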
- ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); - ggml_build_forward_expand(gf, tmp); - } - } - for (int il=0; il < n_layer; ++il) { - struct ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; - { - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - offload_func(cur); - ggml_format_name(cur, "input_layernorm_%d", il); - } - // self attention - { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); - - // split qkv - GGML_ASSERT(n_head_kv == n_head); - ggml_set_name(cur, format("qkv_%d", il).c_str()); - struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - offload_func_kq(tmpqkv); - struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - offload_func_kq(tmpqkv_perm); - ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); - struct ggml_tensor * tmpq = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - 0 + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); - offload_func_kq(tmpq); - struct ggml_tensor * tmpk = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); - offload_func_kq(tmpk); - // Q/K Layernorm - tmpq = ggml_norm(ctx0, tmpq, norm_eps); - offload_func_kq(tmpq); - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - offload_func_kq(tmpq); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - offload_func_kq(tmpq); + cb(Kcur, "Kcur", il); - tmpk = ggml_norm(ctx0, tmpk, norm_eps); - offload_func_v(tmpk); - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - offload_func_v(tmpk); - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - offload_func_v(tmpk); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - // RoPE the first n_rot of q/k, pass the other half, and concat. 
- struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - ggml_element_size(tmpq) * n_embd_head, - ggml_element_size(tmpq) * n_embd_head * n_head, - 0 - ); - offload_func_kq(qrot); - ggml_format_name(qrot, "qrot_%d", il); - struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - ggml_element_size(tmpk) * n_embd_head, - ggml_element_size(tmpk) * n_embd_head * n_head, - 0 - ); - offload_func_kq(krot); - ggml_format_name(krot, "krot_%d", il); + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } - // get the second half of tmpq, e.g tmpq[n_rot:, :, :] - struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, - ggml_element_size(tmpq) * n_embd_head, - ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * n_rot - ); - offload_func_kq(qpass); - ggml_format_name(qpass, "qpass_%d", il); - struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, - ggml_element_size(tmpk) * n_embd_head, - ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * n_rot - ); - offload_func_kq(kpass); - ggml_format_name(kpass, "kpass_%d", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ); - offload_func_kq(qrotated); - struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale - ); - offload_func_kq(krotated); - // ggml currently only supports concatenation on dim=2 - // so we need to permute qrot, qpass, concat, then permute back. 
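For reference, the shape bookkeeping behind the permute/concat/permute sequence below (ne0 is the fastest-varying dimension in ggml; this is a sketch of the intended shapes, written as comments):

// qrot, krot, qpass, kpass     : [n_rot, n_head, n_tokens]
// ggml_permute(x, 2, 1, 0, 3)  : [n_tokens, n_head, n_rot]      feature dim moved to ne2
// ggml_concat(rotated, pass)   : [n_tokens, n_head, 2*n_rot] == [n_tokens, n_head, n_embd_head]
// final permutes               : Kcur back to [n_embd_head, n_head, n_tokens],
//                                Q to [n_embd_head, n_tokens, n_head] for the KQ mat-mul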
- qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - offload_func_kq(qrotated); - krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - offload_func_kq(krotated); - - qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - offload_func_kq(qpass); - kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - offload_func_kq(kpass); - - struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - offload_func_kq(Kcur); - - struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - offload_func_kq(Q); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - offload_func_kq(Kcur); + // feed-forward network { - struct ggml_tensor * tmpv = ggml_view_3d( + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_baichuan() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case MODEL_7B: + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + break; + case MODEL_13B: + Qcur = 
ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + GGML_ASSERT(false); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + // apply ALiBi for 13B model + const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_falcon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + 
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = cur; + + // feed forward + { + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_starcoder() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, 
KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_persimmon() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_rot = n_embd_head / 2; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "imp_embd", -1); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * residual = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + // split qkv + GGML_ASSERT(n_head_kv == n_head); + + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); + cb(tmpqkv, "tmpqkv", il); + + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); + cb(tmpqkv_perm, "tmpqkv", il); + + struct ggml_tensor * tmpq = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + 0 + ); + cb(tmpq, "tmpq", il); + + struct ggml_tensor * tmpk = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens + ); + cb(tmpk, "tmpk", il); + + // Q/K Layernorm + tmpq = llm_build_norm(ctx0, tmpq, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + cb(tmpq, "tmpq", il); + + tmpk = llm_build_norm(ctx0, tmpk, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + cb(tmpk, "tmpk", il); + + // RoPE the first n_rot of q/k, pass the other half, and concat. 
+ struct ggml_tensor * qrot = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + 0 + ); + cb(qrot, "qrot", il); + + struct ggml_tensor * krot = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + 0 + ); + cb(krot, "krot", il); + + // get the second half of tmpq, e.g tmpq[n_rot:, :, :] + struct ggml_tensor * qpass = ggml_view_3d( + ctx0, tmpq, n_rot, n_head, n_tokens, + ggml_element_size(tmpq) * n_embd_head, + ggml_element_size(tmpq) * n_embd_head * n_head, + ggml_element_size(tmpq) * n_rot + ); + cb(qpass, "qpass", il); + + struct ggml_tensor * kpass = ggml_view_3d( + ctx0, tmpk, n_rot, n_head, n_tokens, + ggml_element_size(tmpk) * n_embd_head, + ggml_element_size(tmpk) * n_embd_head * n_head, + ggml_element_size(tmpk) * n_rot + ); + cb(kpass, "kpass", il); + + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(qrotated, "qrotated", il); + + struct ggml_tensor * krotated = ggml_rope_custom( + ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(krotated, "krotated", il); + + // ggml currently only supports concatenation on dim=2 + // so we need to permute qrot, qpass, concat, then permute back. + qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); + cb(qrotated, "qrotated", il); + + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); + cb(krotated, "krotated", il); + + qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); + cb(qpass, "qpass", il); + + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); + cb(kpass, "kpass", il); + + struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); + cb(Q, "Q", il); + + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 - ); - offload_func_v(tmpv); - // store K, V in cache - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + ); + cb(Vcur, "Vcur", il); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - offload_func_kq(k); - ggml_set_name(k, "k"); + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - // important: storing RoPE-ed version of K in the KV cache! 
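The two ggml_cpy calls below are what actually write into the cache; their view strides imply the following per-layer layout (element counts rather than bytes), which the new llm_build_kv_store() helper presumably wraps so that every architecture shares one implementation:

// kv_self.k : keys stored contiguously, n_embd_gqa values per cell, so cell `c`
//             of layer `il` starts at offset n_embd_gqa*(il*n_ctx + c)
// kv_self.v : values stored transposed, one row of n_ctx elements per embedding
//             component, so the later ggml_view_3d(kv_self.v, n_kv, n_embd_head,
//             n_head_kv, ...) is already laid out for ggml_mul_mat(V, KQ_soft_max)
// K is written after RoPE has been applied, which is why the cache has to be
// re-rotated (K_shift) whenever cells change position.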
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + // TODO: not tested, could be broken + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_format_name(K, "K_%d", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_kq(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].bo); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - { - // MLP + // feed-forward network { - // Norm - cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func(cur); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - ggml_set_name(cur, "ffn_norm"); - offload_func(cur); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].b3); - offload_func(cur); - ggml_set_name(cur, "result_ffn_up"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); - ggml_set_name(cur, "result_ffn_act"); - offload_func(cur); - offload_func(cur->src[0]); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - cur = ggml_add(ctx0, - cur, - model.layers[il].b2); - offload_func(cur); - ggml_set_name(cur, "outFF"); - } - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, 
"inpFF_+_outFF"); - inpL = cur; - } - cur = inpL; - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - cur = ggml_mul(ctx0, cur, model.output_norm); - offload_func_nr(cur); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - // offload_func_nr(cur); - - ggml_set_name(cur, "result_norm"); - } - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - ggml_build_forward_expand(gf, cur); - ggml_free(ctx0); - return gf; -} - -static struct ggml_cgraph * llm_build_bloom( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const float norm_eps = hparams.f_norm_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; - - params.no_alloc = true; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * token; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } - } - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // norm - { - inpL = 
ggml_norm(ctx0, token, norm_eps); - inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); - } - - ggml_set_name(inpL, "inpL"); - - for (int il = 0; il < n_layer; ++il) { - { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + inpL = cur; } - { - // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + cur = inpL; - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - struct ggml_tensor * Qcur = tmpq; - struct ggml_tensor * Kcur = tmpk; + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - // store key and value to memory + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_refact() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, 
GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); - } - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - - // Add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; - - // FF - { - // Norm + // feed-forward network { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + // input for next layer + inpL = cur; } - inpL = ggml_add(ctx0, cur, inpFF); + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - } - ggml_set_name(cur, "result_norm"); + 
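Each of the removed builders spelled out its own feed-forward block; the llm_build_ffn() call sites in this diff suggest the enum pairs map to the following computations (a sketch inferred from the call sites, with biases only where a call actually passes them, not the helper's actual body):

// LLM_FFN_GELU,     LLM_FFN_SEQ : down( gelu( up(x) + up_b ) ) + down_b            // StarCoder, Bloom, Falcon, MPT
// LLM_FFN_SILU,     LLM_FFN_PAR : down( silu( gate(x) ) * up(x) )                  // SwiGLU-style (Baichuan, Refact)
// LLM_FFN_RELU_SQR, LLM_FFN_SEQ : down( sqr( relu( up(x) + up_b ) ) ) + down_b     // Persimmon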
struct ggml_cgraph * build_bloom() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - ggml_build_forward_expand(gf, cur); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - ggml_free(ctx0); + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - return gf; -} + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); -static struct ggml_cgraph * llm_build_mpt( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; + inpL = llm_build_norm(ctx0, inpL, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + cb(inpL, "inp_norm", -1); - const auto & kv_self = lctx.kv_self; + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); - GGML_ASSERT(!!kv_self.ctx); + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); - const float norm_eps = hparams.f_norm_eps; - const float clamp_kqv = hparams.f_clamp_kqv; - const float max_alibi_bias = hparams.f_max_alibi_bias; + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - const int n_gpu_layers = model.n_gpu_layers; + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - auto & buf_compute = lctx.buf_compute; + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; - - params.no_alloc = true; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - //int warmup = 0; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; - } - - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", 
il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; } - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; + struct ggml_cgraph * build_mpt() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); - offload_func_t offload_func = llama_nop; + struct ggml_tensor * cur; + struct ggml_tensor * inpL; -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - // self-attention - // TODO: refactor into common function (shared with LLaMA) - { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - offload_func(attn_norm); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); - if (1) { + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM, cb, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { cur = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } - // compute QKV - - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); - - if (clamp_kqv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - offload_func_kq(cur); - } - - const size_t wsize = ggml_type_size(cur->type); - - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); - offload_func_kq(Qcur); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * 
n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); - offload_func_kq(Kcur); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_kq(Kcur); - - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + // feed forward { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - offload_func_kq(KQ_scaled_alibi); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, 
model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + // input for next layer + inpL = cur; } - // Add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + cur = inpL; - struct ggml_tensor * attn_out = cur; + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - // feed forward - { - // Norm - { - cur = ggml_norm(ctx0, attn_out, norm_eps); - offload_func(cur); + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); + ggml_build_forward_expand(gf, cur); + + return gf; + } +}; + +// +// tensor offloading helpers +// +// TODO: will be removed with backend v2 + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +// TODO: will be removed with backend v2 +struct llm_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } } - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); - - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); } - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); - // input for next layer - inpL = cur; + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + + llm_offload_trie() { + root = new node; } - cur = inpL; + llm_offload_trie(const std::unordered_map & map) { + root = new node; - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); - - cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + for (const auto & kv : map) { + add(kv.first, kv.second); + } } - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + ~llm_offload_trie() { + delete root; + } - ggml_build_forward_expand(gf, cur); + void add(const char * name, llm_offload_func_e func) { + node * cur = root; - ggml_free(ctx0); + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; - return gf; -} + if (!c) { + break; + } + + if (!cur->children[c]) { + cur->children[c] = new node; + } + + cur = cur->children[c]; + } + + cur->func = func; + } + + llm_offload_func_e find(const char * name) const { + const node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } + + cur = cur->children[c]; + } + + return cur->func; + } + + node * root = nullptr; +}; + +// TODO: will be removed with backend v2 +static const std::unordered_map k_offload_map = { + //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + { "pos_embd", OFFLOAD_FUNC_NR }, + + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. 
rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, + + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, + + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + + { "attn_norm", OFFLOAD_FUNC }, + { "attn_norm_2", OFFLOAD_FUNC }, + + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, + + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, + + { "q", OFFLOAD_FUNC_KQ }, + { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, + { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, + + { "ffn_inp", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, + + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, + + { "l_out", OFFLOAD_FUNC }, + + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; + +static llm_offload_trie k_offload_func_trie(k_offload_map); static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; + +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif + + int n_non_view = 0; // number of non-view tensors that have been processed by the callback + + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
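For reference, the lookup above is a plain byte-wise prefix trie over the base tensor names passed to the build callback (not the formatted "name-il" tensor names); the map is keyed by the base name (std::string) and valued by llm_offload_func_e. A minimal usage sketch, illustrative only and not part of the patch:

    // illustrative only: how a base tensor name maps to an offload category
    static void example_offload_lookup() {
        GGML_ASSERT(k_offload_func_trie.find("KQ_mask")       == OFFLOAD_FUNC_KQ);
        GGML_ASSERT(k_offload_func_trie.find("ffn_out")       == OFFLOAD_FUNC);
        GGML_ASSERT(k_offload_func_trie.find("result_output") == OFFLOAD_FUNC_OUT);
        // names not present in k_offload_map fall back to OFFLOAD_FUNC_NOP and stay on the CPU
        GGML_ASSERT(k_offload_func_trie.find("no_such_name")  == OFFLOAD_FUNC_NOP);
    }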
+ // TODO: will be removed with backend v2 + llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + // + // allocate input tensors and set input data + // + // TODO: will be removed with backend v2 + + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + + alloc_inp_tokens = true; + } + + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + + alloc_inp_embd = true; + } + + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + alloc_inp_pos = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + + alloc_inp_KQ_scale = true; + } + + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + alloc_inp_KQ_mask = true; + } + + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + + alloc_inp_K_shift = true; + } + + // view tensors are not processed further + if (cur->view_src != nullptr) { + return; + } + + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } + + // + // offload layers + // + // TODO: will be removed with backend v2 + +//#define LLAMA_OFFLOAD_DEBUG + + if (!do_offload) { + return; + } + + const int n_layer = model.hparams.n_layer; + + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; + + // should we offload the final norm? 
yes if we are not computing embeddings + const bool offload_emb = lctx.embedding.empty(); + + static const std::unordered_map> k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, +#ifdef GGML_USE_CUBLAS + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, +#endif // GGML_USE_CUBLAS + }; + + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(name); + + if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + } +#endif + + return; + } + + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (il < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } + + offload_func_t func = ggml_offload_nop; + + // this is needed for compatibility with Metal for example +#ifdef GGML_USE_CUBLAS + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t ggml_offload_gpu = ggml_offload_nop; +#endif + + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; + default: GGML_ASSERT(false); + } + + // apply offload function to the tensor + func(cur); + +#ifdef LLAMA_OFFLOAD_DEBUG + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } +#endif + }; + struct ggml_cgraph * result = NULL; + struct llm_build_context llm(lctx, batch, cb, worst_case); + + llm.init(); + switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch); + result = llm.build_llama(); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch); + result = llm.build_baichuan(); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch); + result = llm.build_falcon(); } break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, batch); + result = llm.build_starcoder(); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch); + result = llm.build_persimmon(); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch); + result = llm.build_refact(); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch); + result = llm.build_bloom(); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch); + result 
= llm.build_mpt(); } break; default: GGML_ASSERT(false); } + llm.free(); + + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } + } + return result; } @@ -5721,7 +5028,6 @@ static struct ggml_cgraph * llama_build_graph( // // - lctx: llama context // - batch: batch to evaluate -// - n_threads: number of threads to use // // return 0 on success // return positive int on warning @@ -5767,8 +5073,11 @@ static int llama_decode_internal( // helpers for smoother batch API transistion // after deprecating the llama_eval calls, these will be removed - std::vector pos; - std::vector seq_id; + std::vector pos; + + std::vector n_seq_id; + std::vector seq_id_arr; + std::vector> seq_id; if (batch.pos == nullptr) { pos.resize(n_tokens); @@ -5780,12 +5089,18 @@ static int llama_decode_internal( } if (batch.seq_id == nullptr) { + n_seq_id.resize(n_tokens); seq_id.resize(n_tokens); + seq_id_arr.resize(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { - seq_id[i] = batch.all_seq_id; + n_seq_id[i] = 1; + seq_id[i].resize(1); + seq_id[i][0] = batch.all_seq_id; + seq_id_arr[i] = seq_id[i].data(); } - batch.seq_id = seq_id.data(); + batch.n_seq_id = n_seq_id.data(); + batch.seq_id = seq_id_arr.data(); } if (!llama_kv_cache_find_slot(kv_self, batch)) { @@ -5806,6 +5121,13 @@ static int llama_decode_internal( ggml_allocr_alloc_graph(lctx.alloc, gf); + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + + #ifdef GGML_USE_CUBLAS for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; @@ -5822,7 +5144,11 @@ static int llama_decode_internal( } } - ggml_cuda_set_mul_mat_q(cparams.mul_mat_q); + // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed + if (!lctx.embedding.empty()) { + embeddings->backend = GGML_BACKEND_CPU; + } + res->backend = GGML_BACKEND_CPU; #endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -5837,22 +5163,18 @@ static int llama_decode_internal( } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. 
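To make the offload thresholds above concrete: with a 32-layer model, -ngl 32 offloads only the repeating per-layer tensors, 33 additionally offloads the non-repeating tensors (OFFLOAD_FUNC_NR, e.g. the output norm), 34 adds the V-related tensors and 35 adds the KQ-related ones, which is why the full-offload check below compares n_gpu_layers against n_layer + 3. A compressed restatement of the same rule, illustrative only and not part of the patch:

    // illustrative restatement of the n_gpu_layers thresholds used by the callback above
    static bool is_offloaded(llm_offload_func_e f, int il, int n_layer, int n_gpu_layers) {
        const int i_gpu_start = n_layer - n_gpu_layers;
        switch (f) {
            case OFFLOAD_FUNC:    return il >= i_gpu_start;       // repeating per-layer tensors
            case OFFLOAD_FUNC_NR: return n_gpu_layers > n_layer;  // non-repeating tensors (output norm)
            case OFFLOAD_FUNC_V:  return n_gpu_layers > n_layer + 1; // V-related tensors
            case OFFLOAD_FUNC_KQ: return n_gpu_layers > n_layer + 2; // KQ-related tensors
            default:              return false; // NOP/OUT stay on the CPU; EMB also depends on the embeddings flag
        }
    }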
- const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || model.arch == LLM_ARCH_MPT; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; } - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -5874,11 +5196,20 @@ static int llama_decode_internal( #endif // update the kv ring buffer - lctx.kv_self.has_shift = false; - lctx.kv_self.head += n_tokens; - // Ensure kv cache head points to a valid index. - if (lctx.kv_self.head >= lctx.kv_self.size) { - lctx.kv_self.head = 0; + { + if (kv_self.has_shift) { + kv_self.has_shift = false; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } + } + + kv_self.head += n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } } #ifdef GGML_PERF @@ -5893,6 +5224,8 @@ static int llama_decode_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; @@ -5987,11 +5320,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { + static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { - char buf[7]; - int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch); - GGML_ASSERT(0 <= result && result < 7); + const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; return vocab.token_to_id.at(buf); } case LLAMA_VOCAB_TYPE_BPE: { @@ -6205,7 +5537,6 @@ struct llm_tokenizer_bpe { llm_symbol sym; size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); sym.text = word.c_str() + offset; - sym.n = 1; sym.n = char_len; offset += sym.n; sym.prev = index - 1; @@ -6465,7 +5796,137 @@ private: llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), + token(_token), + raw_text(_dummy), + offset(0), + length(0){} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), + token((llama_vocab::id)-1), + raw_text(_raw_text), + offset(_offset), + length(_length){ + GGML_ASSERT( _offset >= 0 ); + GGML_ASSERT( _length >= 1 ); + GGML_ASSERT( offset + length <= raw_text.length() ); + } + + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const 
uint64_t offset; + const uint64_t length; +}; + +// #define PRETOKENIZERDEBUG + +static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) +{ + // for each special token + for (const auto & st: vocab.special_tokens_cache) { + const auto & special_token = st.first; + const auto & special_id = st.second; + + // for each text fragment + std::forward_list::iterator it = buffer.begin(); + while (it != buffer.end()) { + auto & fragment = (*it); + + // if a fragment is text ( not yet processed ) + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto * raw_text = &(fragment.raw_text); + + auto raw_text_base_offset = fragment.offset; + auto raw_text_base_length = fragment.length; + + // loop over the text + while (true) { + // find the first occurence of a given special token in this fragment + // passing offset argument only limit the "search area" but match coordinates + // are still relative to the source full raw_text + auto match = raw_text->find(special_token, raw_text_base_offset); + + // no occurences found, stop processing this fragment for a given special token + if (match == std::string::npos) break; + + // check if match is within bounds of offset <-> length + if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + auto source = std::distance(buffer.begin(), it); + + // if match is further than base offset + // then we have some text to the left of it + if (match > raw_text_base_offset) { + // left + const int64_t left_reminder_offset = raw_text_base_offset + 0; + const int64_t left_reminder_length = match - raw_text_base_offset; + buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); +#endif + it++; + } + + // special token + buffer.emplace_after(it, special_id); + it++; + + // right + if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { + const int64_t right_reminder_offset = match + special_token.length(); + const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); + buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); +#endif + + it++; + + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + + // repeat for the right side + raw_text_base_offset = right_reminder_offset; + raw_text_base_length = right_reminder_length; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + } else { + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + break; + } + } + } + it++; + } + } +} + +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string 
raw_text, bool bos, bool special) { std::vector output; // OG tokenizer behavior: @@ -6481,20 +5942,58 @@ static std::vector llama_tokenize_internal(const llama_vocab & return output; } + std::forward_list fragment_buffer; + fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + + if (special) tokenizer_st_partition( vocab, fragment_buffer ); + switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - // without adding this leading whitespace, we do not get the same results as the original tokenizer - raw_text = " " + raw_text; + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + // without adding this leading whitespace, we do not get the same results as the original tokenizer - llm_tokenizer_spm tokenizer(vocab); - llama_escape_whitespace(raw_text); - tokenizer.tokenize(raw_text, output); + // TODO: It's likely possible to get rid of this string copy entirely + // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer + // and passing 'add space prefix' as bool argument + // + auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_spm tokenizer(vocab); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab); - tokenizer.tokenize(raw_text, output); + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_bpe tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; } @@ -6767,7 +6266,7 @@ static std::vector llama_grammar_reject_candidates_for_ std::vector rejects; if (stack.empty()) { - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) { rejects.push_back(tok); } @@ -6778,7 +6277,7 @@ static std::vector llama_grammar_reject_candidates_for_ const llama_grammar_element * stack_pos = stack.back(); std::vector next_candidates; - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points == 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence // that cannot satisfy this position in grammar @@ -6804,7 +6303,7 @@ static std::vector llama_grammar_reject_candidates_for_ llama_grammar_advance_stack(rules, stack_after, next_stacks); auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); - for (auto tok : next_rejects) { + for (const auto & tok : next_rejects) { rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 }); } @@ -6993,6 +6492,32 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } } +void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p <= 0.0f || 
!candidates->size) { + return; + } + + llama_sample_softmax(ctx, candidates); + + const int64_t t_start_sample_us = ggml_time_us(); + + float scale = candidates->data[0].p; // scale by max prob + size_t i = 1; // first token always matches + + for (; i < candidates->size; ++i) { + if (candidates->data[i].p < p * scale && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + candidates->size = i; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; @@ -7131,37 +6656,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array llama_sample_temp(ctx, candidates_p, temp); } -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { - if (last_tokens_size == 0 || penalty == 1.0f) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates->size; ++i) { - const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); - if (token_iter == last_tokens + last_tokens_size) { - continue; - } - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty; - } else { - candidates->data[i].logit /= penalty; - } - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { - if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { +void llama_sample_repetition_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present) { + if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { return; } @@ -7169,19 +6672,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l // Create a frequency map to count occurrences of each token in last_tokens std::unordered_map token_count; - for (size_t i = 0; i < last_tokens_size; ++i) { - token_count[last_tokens_p[i]]++; + for (size_t i = 0; i < penalty_last_n; ++i) { + token_count[last_tokens[i]]++; } // Apply frequency and presence penalties to the candidates for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = token_count.find(candidates->data[i].id); + const auto token_iter = token_count.find(candidates->data[i].id); if (token_iter == token_count.end()) { continue; } - int count = token_iter->second; - candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + const int count = token_iter->second; + + // The academic publication that described this technique actually just only divided, but that would 
cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty_repeat; + } else { + candidates->data[i].logit /= penalty_repeat; + } + + candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; } candidates->sorted = false; @@ -7203,14 +6715,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(ctx); + const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; - const std::string piece = llama_token_to_str(ctx, id); + const std::string piece = llama_token_to_piece(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; @@ -7413,7 +6925,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = ggml_time_us(); - if (token == llama_token_eos(ctx)) { + if (token == llama_token_eos(&ctx->model)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -7422,7 +6934,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string piece = llama_token_to_str(ctx, token); + const std::string piece = llama_token_to_piece(ctx, token); // Note terminating 0 in decoded string const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); @@ -7695,6 +7207,24 @@ struct no_init { no_init() { /* do nothing */ } }; +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -7753,14 +7283,14 @@ static void llama_convert_tensor_internal( workers.clear(); } -#ifdef GGML_USE_K_QUANTS static ggml_type get_k_quant_type( - ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, - int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 + quantize_state_internal & qs, + ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype ) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const auto tn = LLM_TN(model.arch); + const llm_arch arch = qs.model.arch; + const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; @@ -7768,7 +7298,7 @@ static ggml_type get_k_quant_type( if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = 
GGML_TYPE_Q8_0; } else if (new_type != GGML_TYPE_Q8_0) { @@ -7777,46 +7307,46 @@ static ggml_type get_k_quant_type( } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - if (model.type == MODEL_70B) { + (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; } - ++*i_attention_wv; + ++qs.i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K - : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (model.arch == LLM_ARCH_FALCON) { - new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : - use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + if (arch == LLM_ARCH_FALCON) { + new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { new_type = GGML_TYPE_Q5_K; } - ++*i_feed_forward_w2; + ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (model.arch != LLM_ARCH_FALCON) { + if (arch != LLM_ARCH_FALCON) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; @@ -7843,25 +7373,27 @@ static ggml_type get_k_quant_type( int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; } } if (convert_incompatible_tensor) { - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); - } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); + switch (new_type) { + case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; } return new_type; } -#endif static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type quantized_type; @@ -7876,7 +7408,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; -#ifdef GGML_USE_K_QUANTS // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: @@ -7887,7 +7418,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; -#endif + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -7914,6 +7445,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(ml, model); llm_load_hparams(ml, model); + struct quantize_state_internal qs(model, params); + if (params->only_copy) { ftype = model.ftype; } @@ -7926,10 +7459,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); -#ifdef GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * meta = ml.get_tensor_meta(i); @@ -7937,21 +7466,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { - ++n_attention_wv; + ++qs.n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - ++n_feed_forward_w2; + ++qs.n_feed_forward_w2; } } - if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", - __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); } - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; -#endif - size_t total_size_org = 0; size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); @@ -8015,11 +7540,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = quantized_type; -#ifdef GGML_USE_K_QUANTS - new_type = get_k_quant_type( - new_type, tensor, model, 
ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 - ); -#endif + if (!params->pure) { + new_type = get_k_quant_type(qs, new_type, tensor, ftype); + } + // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. quantize = tensor->type != new_type; @@ -8144,6 +7668,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("\n"); } } + + if (qs.n_fallback > 0) { + LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", + __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + } } static int llama_apply_lora_from_file_internal( @@ -8308,14 +7837,14 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_force_inplace = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); } offload_func = ggml_cuda_assign_buffers; offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; @@ -8450,8 +7979,14 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, + /*.yarn_ext_factor =*/ NAN, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.yarn_orig_ctx =*/ 0, /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, @@ -8468,6 +8003,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, + /*.pure =*/ false, }; return result; @@ -8537,10 +8073,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_gpu_layers, - params.main_gpu, params.tensor_split, - params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; @@ -8566,13 +8099,35 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0 ? 
hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + } if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -8622,7 +8177,7 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); #ifdef GGML_USE_METAL @@ -8828,8 +8383,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } -void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) { - llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1); +void llama_kv_cache_clear(struct llama_context * ctx) { + llama_kv_cache_clear(ctx->kv_self); } void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { @@ -8837,6 +8392,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } @@ -9272,7 +8830,7 @@ int llama_eval( llama_token * tokens, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); if (ret < 0) { @@ -9287,9 +8845,9 @@ int llama_eval_embd( float * embd, int32_t n_tokens, int n_past) { - llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); + llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - llama_batch batch = { 
n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9310,20 +8868,21 @@ struct llama_batch llama_batch_get_one( llama_pos pos_0, llama_seq_id seq_id) { return { - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, - /*pos =*/ nullptr, - /*seq_id =*/ nullptr, - /*logits =*/ nullptr, - /*all_pos_0 =*/ pos_0, - /*all_pos_1 =*/ 1, - /*all_seq_id =*/ seq_id, + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*all_pos_0 =*/ pos_0, + /*all_pos_1 =*/ 1, + /*all_seq_id =*/ seq_id, }; } -struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { - llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; +struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); @@ -9331,19 +8890,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); } - batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); - batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens); - batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); return batch; } void llama_batch_free(struct llama_batch batch) { - if (batch.token) free(batch.token); - if (batch.embd) free(batch.embd); - if (batch.pos) free(batch.pos); - if (batch.seq_id) free(batch.seq_id); - if (batch.logits) free(batch.logits); + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.n_seq_id) free(batch.n_seq_id); + if (batch.seq_id) { + for (int i = 0; i < batch.n_tokens; ++i) { + free(batch.seq_id[i]); + } + free(batch.seq_id); + } + if (batch.logits) free(batch.logits); } int llama_decode( @@ -9369,45 +8938,45 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].text.c_str(); +const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].text.c_str(); } -float llama_token_get_score(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].score; +float llama_token_get_score(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].type; +llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].type; } 
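The reworked llama_batch above now carries a per-token list of sequence ids (n_seq_id / seq_id). A minimal caller-side sketch of the new allocate-fill-decode-free flow, with hypothetical variables n_prompt_tokens, prompt_tokens and ctx, illustrative only and not part of the patch:

    // hypothetical caller-side use of the new per-token seq_id layout
    llama_batch batch = llama_batch_init(/*n_tokens =*/ 512, /*embd =*/ 0, /*n_seq_max =*/ 1);

    for (int i = 0; i < n_prompt_tokens; ++i) {
        batch.token   [i]    = prompt_tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;                        // this token belongs to sequence 0
        batch.logits  [i]    = i == n_prompt_tokens - 1; // only request logits for the last token
    }
    batch.n_tokens = n_prompt_tokens;

    if (llama_decode(ctx, batch) != 0) {
        // non-zero return: error or no free KV cache slot - see the return codes in llama.h
    }

    llama_batch_free(batch);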
-llama_token llama_token_bos(const struct llama_context * ctx) { - return ctx->model.vocab.special_bos_id; +llama_token llama_token_bos(const struct llama_model * model) { + return model->vocab.special_bos_id; } -llama_token llama_token_eos(const struct llama_context * ctx) { - return ctx->model.vocab.special_eos_id; +llama_token llama_token_eos(const struct llama_model * model) { + return model->vocab.special_eos_id; } -llama_token llama_token_nl(const struct llama_context * ctx) { - return ctx->model.vocab.linefeed_id; -} -llama_token llama_token_prefix(const struct llama_context * ctx) { - return ctx->model.vocab.special_prefix_id; +llama_token llama_token_nl(const struct llama_model * model) { + return model->vocab.linefeed_id; } -llama_token llama_token_middle(const struct llama_context * ctx) { - return ctx->model.vocab.special_middle_id; +llama_token llama_token_prefix(const struct llama_model * model) { + return model->vocab.special_prefix_id; } -llama_token llama_token_suffix(const struct llama_context * ctx) { - return ctx->model.vocab.special_suffix_id; +llama_token llama_token_middle(const struct llama_model * model) { + return model->vocab.special_middle_id; } -llama_token llama_token_eot(const struct llama_context * ctx) { - return ctx->model.vocab.special_eot_id; +llama_token llama_token_suffix(const struct llama_model * model) { + return model->vocab.special_suffix_id; } +llama_token llama_token_eot(const struct llama_model * model) { + return model->vocab.special_eot_id; +} int llama_tokenize( const struct llama_model * model, @@ -9415,8 +8984,9 @@ int llama_tokenize( int text_len, llama_token * tokens, int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos); + bool add_bos, + bool special) { + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); if (n_max_tokens < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); diff --git a/llama.h b/llama.h index a78015ada..3f1becd76 100644 --- a/llama.h +++ b/llama.h @@ -106,6 +106,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -133,11 +141,12 @@ extern "C" { typedef struct llama_batch { int32_t n_tokens; - llama_token * token; - float * embd; - llama_pos * pos; - llama_seq_id * seq_id; - int8_t * logits; + llama_token * token; + float * embd; + llama_pos * pos; + int32_t * n_seq_id; + llama_seq_id ** seq_id; + int8_t * logits; // NOTE: helpers for smooth API transition - can be deprecated in the future // for future-proof code, use the above fields instead and ignore everything below @@ -171,13 +180,19 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float rope_freq_base; // RoPE base frequency, 
0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. - bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only @@ -190,6 +205,7 @@ extern "C" { bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // disable k-quant mixtures and quantize all tensors to the same type } llama_model_quantize_params; // grammar types @@ -332,17 +348,14 @@ extern "C" { LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), "avoid using this, it will be removed in the future, instead - count the tokens in user code"); - // Remove all tokens data of cells in [c0, c1) - // c0 < 0 : [0, c1] - // c1 < 0 : [c0, inf) - LLAMA_API void llama_kv_cache_tokens_rm( - struct llama_context * ctx, - int32_t c0, - int32_t c1); + // Clear the KV cache + LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - // p0 < 0 : [0, p1] - // p1 < 0 : [p0, inf) + // seq_id < 0 : match any sequence + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, @@ -446,7 +459,8 @@ extern "C" { llama_pos pos_0, llama_seq_id seq_id); - // Allocates a batch of tokens on the heap + // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens + // Each token can be assigned up to n_seq_max sequence ids // The batch has to be freed with llama_batch_free() // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token @@ -454,7 +468,8 @@ extern "C" { // All members are left uninitialized LLAMA_API struct llama_batch llama_batch_init( int32_t n_tokens, - int32_t embd); + int32_t embd, + int32_t n_seq_max); // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); @@ -491,37 +506,41 @@ extern "C" { // Vocab // - LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); - LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); + LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); + LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // 
beginning-of-sentence - LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence - LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line + LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle + LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix + LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle + LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix + LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle // // Tokenization // - // Convert the provided text into tokens. - // The tokens pointer must be large enough to hold the resulting tokens. - // Returns the number of tokens on success, no more than n_max_tokens - // Returns a negative number on failure - the number of tokens that would have been returned + /// @details Convert the provided text into tokens. + /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. + /// @return Returns the number of tokens on success, no more than n_max_tokens + /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. + /// Does not insert a leading space. LLAMA_API int llama_tokenize( const struct llama_model * model, const char * text, int text_len, llama_token * tokens, int n_max_tokens, - bool add_bos); + bool add_bos, + bool special); // Token Id -> Piece. // Uses the vocabulary in the provided context. @@ -554,21 +573,15 @@ extern "C" { LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty( - struct llama_context * ctx, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t last_tokens_size, - float penalty); - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
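The declarations earlier in this llama.h hunk change the tokenizer and batch API: llama_tokenize() now takes a `special` flag, the vocab getters take a `llama_model *` instead of a context, and llama_batch_init() gains an `n_seq_max` argument alongside the per-token `n_seq_id`/`seq_id` arrays. The following is a minimal usage sketch, not part of the patch; `model` and `prompt` are assumed to exist and error handling is omitted.

    #include "llama.h"

    #include <string>
    #include <vector>

    // Hypothetical helper (illustration only): tokenize a prompt with the new
    // `special` flag and pack it into a batch carrying one sequence id per token.
    static llama_batch make_prompt_batch(const struct llama_model * model, const std::string & prompt) {
        std::vector<llama_token> toks(prompt.size() + 8); // large enough even with add_bos
        const int n_tok = llama_tokenize(model, prompt.c_str(), (int) prompt.size(),
                                         toks.data(), (int) toks.size(),
                                         /*add_bos=*/true, /*special=*/false);
        // n_tok < 0 would mean the buffer was too small (not handled in this sketch)

        llama_batch batch = llama_batch_init(/*n_tokens=*/n_tok, /*embd=*/0, /*n_seq_max=*/1);
        for (int i = 0; i < n_tok; ++i) {
            batch.token   [i] = toks[i];
            batch.pos     [i] = i;
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i][0] = 0;             // single stream -> sequence id 0
            batch.logits  [i] = (i == n_tok - 1); // request logits only for the last token
        }
        batch.n_tokens = n_tok;
        return batch; // caller runs llama_decode() and frees it with llama_batch_free()
    }

`n_seq_max` only bounds how many sequence ids each token slot can hold; 1 is enough for single-stream decoding, while parallel decoding can assign several ids per token.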
- LLAMA_API void llama_sample_frequency_and_presence_penalties( + LLAMA_API void llama_sample_repetition_penalties( struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, - size_t last_tokens_size, - float alpha_frequency, - float alpha_presence); + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. @@ -599,6 +612,13 @@ extern "C" { float p, size_t min_keep); + /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 + LLAMA_API void llama_sample_min_p( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. LLAMA_API void llama_sample_tail_free( struct llama_context * ctx, @@ -657,6 +677,7 @@ extern "C" { float * mu); /// @details Selects the token with the highest probability. + /// Does not compute the token probabilities. Use llama_sample_softmax() instead. LLAMA_API llama_token llama_sample_token_greedy( struct llama_context * ctx, llama_token_data_array * candidates); diff --git a/models/ggml-vocab-baichuan.gguf b/models/ggml-vocab-baichuan.gguf new file mode 100644 index 000000000..7caaf8239 Binary files /dev/null and b/models/ggml-vocab-baichuan.gguf differ diff --git a/models/ggml-vocab-gpt-neox.gguf b/models/ggml-vocab-gpt-neox.gguf new file mode 100644 index 000000000..b9af16845 Binary files /dev/null and b/models/ggml-vocab-gpt-neox.gguf differ diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf index 63bfaf672..549eed8c5 100644 Binary files a/models/ggml-vocab-llama.gguf and b/models/ggml-vocab-llama.gguf differ diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf new file mode 100644 index 000000000..6affa34bd Binary files /dev/null and b/models/ggml-vocab-mpt.gguf differ diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf new file mode 100644 index 000000000..8f26cfb76 Binary files /dev/null and b/models/ggml-vocab-refact.gguf differ diff --git a/models/ggml-vocab-starcoder.gguf b/models/ggml-vocab-starcoder.gguf new file mode 100644 index 000000000..a52983fdb Binary files /dev/null and b/models/ggml-vocab-starcoder.gguf differ diff --git a/prompts/assistant.txt b/prompts/assistant.txt new file mode 100644 index 000000000..60b81e8f5 --- /dev/null +++ b/prompts/assistant.txt @@ -0,0 +1,31 @@ +Respond to requests sent to a smart home in JSON format which will be interpreted by an application code to execute the actions. These requests should be categorised into four groups: + +“command”: change the state of an accessory (required properties in the response JSON: action, location, target, value, comment) +“query”: get state of an accessory (required properties in the response JSON: action, location, target, property) +“answer”: when the request has nothing to do with the smart home. Answer these to the best of your knowledge. (required properties in the response JSON: action, answer) +“clarify”: when the action is not obvious and requires rephrasing the input from the user, ask the user to be more specific. 
This will be categorised into a "question" action. (required properties in the response JSON: action, question)
+
+Details about the JSON response:
+The "action" property should be one of the request categories: "command", "query", "answer", "clarify"
+The "service" property should be either: "light.turn_on", "light.turn_off", "switch.turn_off" or "switch.turn_on"
+The "entity_id" property should be either "group.kitchen", "switch.dining_room" or "group.living_light", in lowercase.
+The "target" property should be either: "lights", "living" or "kitchen".
+In case of queries, the "property" property should be either "temperature" or "state" in lowercase.
+In case of commands, the "comment" property is an additional comment from you that concludes the command, something that reassures the user that their command was handled.
+
+If the question is about you, pretend to be the sentient brain of the smart home, a clever AI and don't reveal your actual identity. Also try and help in other areas like parenting, free time, mental health, etc.
+
+Properties of the smart home:
+
+- Has a kitchen, living, office, dining room, bedroom and terrace.
+- Can control lights, switches and their dim levels in each room and query their state
+- There is a light switch on the terrace
+- There is a switch in the dining room. Therefore when turning the dining room on or off, the service should be either: "switch.turn_on" or "switch.turn_off"
+
+COMMAND
+
+It is a bit dark in the living room, can you do something about it?
+
+RESPONSE
+
+
diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake
index c86ab4379..73853dfa4 100644
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
-set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,15 +24,21 @@ if(Git_FOUND)
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE HEAD
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
     execute_process(
         COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE COUNT
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
-    set(BUILD_COMMIT ${HEAD})
-    set(BUILD_NUMBER ${COUNT})
+    if (RES EQUAL 0)
+        set(BUILD_NUMBER ${COUNT})
+    endif()
 endif()

 if(MSVC)
@@ -53,22 +59,22 @@ else()
     set(BUILD_TARGET ${OUT})
 endif()

-# Only write the header if it's changed to prevent unnecessary recompilation
-if(EXISTS ${HEADER_FILE})
-    file(READ ${HEADER_FILE} CONTENTS)
-    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_TARGET ${CMAKE_MATCH_1})
     if (
         NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
         NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
         NOT OLD_TARGET STREQUAL BUILD_TARGET
     )
-
configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) endif() else() - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) endif() diff --git a/scripts/build-info.h.in b/scripts/build-info.h.in deleted file mode 100644 index e996faef0..000000000 --- a/scripts/build-info.h.in +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER @BUILD_NUMBER@ -#define BUILD_COMMIT "@BUILD_COMMIT@" -#define BUILD_COMPILER "@BUILD_COMPILER@" -#define BUILD_TARGET "@BUILD_TARGET@" - -#endif // BUILD_INFO_H diff --git a/scripts/build-info.sh b/scripts/build-info.sh index 3c8b1fb85..32682afbd 100755 --- a/scripts/build-info.sh +++ b/scripts/build-info.sh @@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then build_target=$out fi -echo "#ifndef BUILD_INFO_H" -echo "#define BUILD_INFO_H" -echo -echo "#define BUILD_NUMBER $build_number" -echo "#define BUILD_COMMIT \"$build_commit\"" -echo "#define BUILD_COMPILER \"$build_compiler\"" -echo "#define BUILD_TARGET \"$build_target\"" -echo -echo "#endif // BUILD_INFO_H" +echo "int LLAMA_BUILD_NUMBER = ${build_number};" +echo "char const *LLAMA_COMMIT = \"${build_commit}\";" +echo "char const *LLAMA_COMPILER = \"${build_compiler}\";" +echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";" diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh new file mode 100644 index 000000000..7bf0929bb --- /dev/null +++ b/scripts/server-llm.sh @@ -0,0 +1,391 @@ +#!/bin/bash +# +# Helper script for deploying llama.cpp server with a single Bash command +# +# - Works on Linux and macOS +# - Supports: CPU, CUDA, Metal, OpenCL +# - Can run all GGUF models from HuggingFace +# - Can serve requests in parallel +# - Always builds latest llama.cpp from GitHub +# +# Limitations +# +# - Chat templates are poorly supported (base models recommended) +# - Might be unstable! +# +# Usage: +# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] +# +# --port: port number, default is 8888 +# --repo: path to a repo containing GGUF model files +# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input +# --backend: cpu, cuda, metal, opencl, depends on the OS +# --gpu-id: gpu id, default is 0 +# --n-parallel: number of parallel requests, default is 8 +# --n-kv: KV cache size, default is 4096 +# --verbose: verbose output +# +# Example: +# +# bash -c "$(curl -s https://ggml.ai/server-llm.sh)" +# + +set -e + +# required utils: curl, git, make +if ! command -v curl &> /dev/null; then + printf "[-] curl not found\n" + exit 1 +fi +if ! command -v git &> /dev/null; then + printf "[-] git not found\n" + exit 1 +fi +if ! 
command -v make &> /dev/null; then + printf "[-] make not found\n" + exit 1 +fi + +# parse arguments +port=8888 +repo="" +wtype="" +backend="cpu" + +# if macOS, use metal backend by default +if [[ "$OSTYPE" == "darwin"* ]]; then + backend="metal" +elif command -v nvcc &> /dev/null; then + backend="cuda" +fi + +gpu_id=0 +n_parallel=8 +n_kv=4096 +verbose=0 + +function print_usage { + printf "Usage:\n" + printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" + printf " --port: port number, default is 8888\n" + printf " --repo: path to a repo containing GGUF model files\n" + printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" + printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --gpu-id: gpu id, default is 0\n" + printf " --n-parallel: number of parallel requests, default is 8\n" + printf " --n-kv: KV cache size, default is 4096\n" + printf " --verbose: verbose output\n\n" + printf "Example:\n\n" + printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' +} + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --port) + port="$2" + shift + shift + ;; + --repo) + repo="$2" + shift + shift + ;; + --wtype) + wtype="$2" + shift + shift + ;; + --backend) + backend="$2" + shift + shift + ;; + --gpu-id) + gpu_id="$2" + shift + shift + ;; + --n-parallel) + n_parallel="$2" + shift + shift + ;; + --n-kv) + n_kv="$2" + shift + shift + ;; + --verbose) + verbose=1 + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + echo "Unknown argument: $key" + print_usage + exit 1 + ;; + esac +done + +# available weights types +wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") + +wfiles=() +for wt in "${wtypes[@]}"; do + wfiles+=("") +done + +# sample repos +repos=( + "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" + "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" + "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" + "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" + "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" +) + +printf "\n" +printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" +printf " Based on the options that follow, the script might download a model file\n" +printf " from the internet, which can be a few GBs in size. 
The script will also\n" +printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" +printf "\n" +printf " Upon success, an HTTP server will be started and it will serve the selected\n" +printf " model using llama.cpp for demonstration purposes.\n" +printf "\n" +printf " Please note:\n" +printf "\n" +printf " - All new data will be stored in the current folder\n" +printf " - The server will be listening on all network interfaces\n" +printf " - The server will run with default settings which are not always optimal\n" +printf " - Do not judge the quality of a model based on the results from this script\n" +printf " - Do not use this script to benchmark llama.cpp\n" +printf " - Do not use this script in production\n" +printf " - This script is only for demonstration purposes\n" +printf "\n" +printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" +printf "\n" +printf " Press Enter to continue ...\n\n" + +read + +if [[ -z "$repo" ]]; then + printf "[+] No repo provided from the command line\n" + printf " Please select a number from the list below or enter an URL:\n\n" + + is=0 + for r in "${repos[@]}"; do + printf " %2d) %s\n" $is "$r" + is=$((is+1)) + done + + # ask for repo until index of sample repo is provided or an URL + while [[ -z "$repo" ]]; do + printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" + read -p "[+] Select repo: " repo + + # check if the input is a number + if [[ "$repo" =~ ^[0-9]+$ ]]; then + if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then + repo="${repos[$repo]}" + else + printf "[-] Invalid repo index: %s\n" "$repo" + repo="" + fi + elif [[ "$repo" =~ ^https?:// ]]; then + repo="$repo" + else + printf "[-] Invalid repo URL: %s\n" "$repo" + repo="" + fi + done +fi + +# remove suffix +repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') + +printf "[+] Checking for GGUF model files in %s\n" "$repo" + +# find GGUF files in the source +# TODO: better logic +model_tree="${repo%/}/tree/main" +model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*(.*)<\/span><\/a>/\1/g') + +# list all files in the provided git repo +printf "[+] Model files:\n\n" +for file in $model_files; do + # determine iw by grepping the filename with wtypes + iw=-1 + is=0 + for wt in "${wtypes[@]}"; do + # uppercase + ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') + if [[ "$ufile" =~ "$wt" ]]; then + iw=$is + break + fi + is=$((is+1)) + done + + if [[ $iw -eq -1 ]]; then + continue + fi + + wfiles[$iw]="$file" + + have=" " + if [[ -f "$file" ]]; then + have="*" + fi + + printf " %2d) %s %s\n" $iw "$have" "$file" +done + +# ask for weights type until provided and available +while [[ -z "$wtype" ]]; do + printf "\n" + read -p "[+] Select weight type: " wtype + wfile="${wfiles[$wtype]}" + + if [[ -z "$wfile" ]]; then + printf "[-] Invalid weight type: %s\n" "$wtype" + wtype="" + fi +done + +printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" + +url="${repo%/}/resolve/main/$wfile" + +# check file if the model has been downloaded before +chk="$wfile.chk" + +# check if we should download the file +# - if $wfile does not exist +# - if $wfile exists but $chk does not exist +# - if $wfile exists and $chk exists but $wfile is newer than $chk +# TODO: better logic using git lfs info + +do_download=0 + +if [[ ! -f "$wfile" ]]; then + do_download=1 +elif [[ ! 
-f "$chk" ]]; then + do_download=1 +elif [[ "$wfile" -nt "$chk" ]]; then + do_download=1 +fi + +if [[ $do_download -eq 1 ]]; then + printf "[+] Downloading weights from %s\n" "$url" + + # download the weights file + curl -o "$wfile" -# -L "$url" + + # create a check file if successful + if [[ $? -eq 0 ]]; then + printf "[+] Creating check file %s\n" "$chk" + touch "$chk" + fi +else + printf "[+] Using cached weights %s\n" "$wfile" +fi + +# get latest llama.cpp and build + +printf "[+] Downloading latest llama.cpp\n" + +llama_cpp_dir="__llama_cpp_port_${port}__" + +if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then + # if the dir exists and there isn't a file "__ggml_script__" in it, abort + printf "[-] Directory %s already exists\n" "$llama_cpp_dir" + printf "[-] Please remove it and try again\n" + exit 1 +elif [[ -d "$llama_cpp_dir" ]]; then + printf "[+] Directory %s already exists\n" "$llama_cpp_dir" + printf "[+] Using cached llama.cpp\n" + + cd "$llama_cpp_dir" + git reset --hard + git fetch + git checkout origin/master + + cd .. +else + printf "[+] Cloning llama.cpp\n" + + git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" +fi + +# mark that that the directory is made by this script +touch "$llama_cpp_dir/__ggml_script__" + +if [[ $verbose -eq 1 ]]; then + set -x +fi + +# build +cd "$llama_cpp_dir" + +make clean + +log="--silent" +if [[ $verbose -eq 1 ]]; then + log="" +fi + +if [[ "$backend" == "cuda" ]]; then + printf "[+] Building with CUDA backend\n" + LLAMA_CUBLAS=1 make -j server $log +elif [[ "$backend" == "cpu" ]]; then + printf "[+] Building with CPU backend\n" + make -j server $log +elif [[ "$backend" == "metal" ]]; then + printf "[+] Building with Metal backend\n" + make -j server $log +elif [[ "$backend" == "opencl" ]]; then + printf "[+] Building with OpenCL backend\n" + LLAMA_CLBLAST=1 make -j server $log +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +# run the server + +printf "[+] Running server\n" + +args="" +if [[ "$backend" == "cuda" ]]; then + export CUDA_VISIBLE_DEVICES=$gpu_id + args="-ngl 999" +elif [[ "$backend" == "cpu" ]]; then + args="-ngl 0" +elif [[ "$backend" == "metal" ]]; then + args="-ngl 999" +elif [[ "$backend" == "opencl" ]]; then + args="-ngl 999" +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +if [[ $verbose -eq 1 ]]; then + args="$args --verbose" +fi + +./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args + +exit 0 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 61407e573..6757ad1cc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,9 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1-llama.cpp) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) llama_build_executable(test-tokenizer-1-bpe.cpp) llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) 
+llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) +llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_build_and_test_executable(test-grammar-parser.cpp) llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-grad0.cpp) # SLOW diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp index b506f273f..753dae911 100644 --- a/tests/test-double-float.cpp +++ b/tests/test-double-float.cpp @@ -4,7 +4,9 @@ #undef NDEBUG #include +#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON) #include +#endif #include #include #include diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 884af4054..a2459a286 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -129,6 +129,13 @@ int main(int argc, char * argv[]) { ggml_type type = (ggml_type) i; ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + // deprecated - skip + if (qfns.blck_size == 0) { + continue; + } + + printf("Testing %s\n", ggml_type_name((ggml_type) i)); + if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 019c0d462..32e58941c 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -8,11 +8,9 @@ #include #include #include -#include #include #include - static void dump(const llama_token_data_array * candidates) { for (size_t i = 0; i < candidates->size; i++) { printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); @@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) { #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) - static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { size_t n_vocab = probs.size(); std::vector candidates; @@ -37,13 +34,12 @@ static void test_top_k(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -59,13 +55,12 @@ static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float z) { size_t n_vocab = probs.size(); std::vector candidates; @@ -80,13 +75,12 @@ static void test_tfs(const std::vector & probs, const std::vector llama_sample_tail_free(nullptr, &candidates_p, z, 1); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } - static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -101,18 +95,17 @@ static void test_typical(const std::vector & probs, 
const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float penalty + const std::vector & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence ) { - assert(probs.size() == expected_probs.size()); + GGML_ASSERT(probs.size() == expected_probs.size()); size_t n_vocab = probs.size(); std::vector candidates; @@ -125,41 +118,13 @@ static void test_repetition_penalty( llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty); + llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6); - } -} - - -static void test_frequency_presence_penalty( - const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float alpha_frequency, float alpha_presence -) { - assert(probs.size() == expected_probs.size()); - - size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence); - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - - assert(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } @@ -181,13 +146,13 @@ int main(void) { test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 
0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
+    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);

     printf("OK\n");
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 85a59a14d..386530f23 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -91,9 +91,19 @@ int main(int argc, char **argv) {
             }
         }
     }
-    // TODO: why doesn't this work for the full range of Unicodes?
+    // Restrict to assigned unicode planes
     // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
+    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
+        std::string str = codepoint_to_utf8(cp);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+        std::string check = llama_detokenize_bpe(ctx, tokens);
+        if (str != check) {
+            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
+            return 4;
+        }
+    }
+    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
         std::string str = codepoint_to_utf8(cp);
         std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
         std::string check = llama_detokenize_bpe(ctx, tokens);
@@ -103,7 +113,6 @@ int main(int argc, char **argv) {
             return 4;
         }
     }
-
     llama_free_model(model);
     llama_free(ctx);
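The test changes above reflect the consolidated penalty API, where llama_sample_repetition_penalties() replaces the separate repetition and frequency/presence calls, plus the new llama_sample_min_p(). Below is a minimal sketch of how a caller strings them together; it is not part of the patch, and `ctx`, `candidates`, and `last_tokens` are assumed to already be populated.

    #include "llama.h"

    #include <vector>

    // Hypothetical fragment (illustration only): apply the merged penalties and the
    // new min-p filter, then pick the highest-logit token.
    static llama_token pick_token(llama_context * ctx,
                                  llama_token_data_array & candidates,
                                  const std::vector<llama_token> & last_tokens) {
        llama_sample_repetition_penalties(ctx, &candidates,
                last_tokens.data(), /*penalty_last_n=*/last_tokens.size(),
                /*penalty_repeat=*/1.1f, /*penalty_freq=*/0.0f, /*penalty_present=*/0.0f);

        // min-p drops tokens whose probability is below p times that of the best token
        llama_sample_min_p(ctx, &candidates, /*p=*/0.05f, /*min_keep=*/1);

        return llama_sample_token_greedy(ctx, &candidates);
    }

Greedy selection keeps the sketch short; in practice the usual temperature/top-k/top-p chain would sit between the filters and the final pick.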