diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile new file mode 100644 index 000000000..e5fcb37d6 --- /dev/null +++ b/.devops/full-cuda.Dockerfile @@ -0,0 +1,33 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 + +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +COPY requirements.txt requirements.txt + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile new file mode 100644 index 000000000..30c01196a --- /dev/null +++ b/.devops/main-cuda.Dockerfile @@ -0,0 +1,32 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 860a7e891..2787c21fe 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,13 +10,13 @@ shift # Join the remaining arguments into a single string arg2="$@" -if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then - python3 ./convert.py $arg2 -elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then - ./quantize $arg2 -elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then - ./main $arg2 -elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then +if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then + python3 ./convert.py "$arg2" +elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then + ./quantize "$arg2" +elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then + ./main "$arg2" +elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -26,6 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then ./quantize "$i" "${i/f16/q4_0}" q4_0 fi done +elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then + ./server "$arg2" else echo "Unknown command: $arg1" echo "Available commands: " @@ -37,4 +39,6 @@ else echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" + echo " --server (-s): Run a model on the server" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080" fi diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b87ea76bc..b6e21b4ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,10 @@ on: paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GGML_NLOOP: 3 + GGML_NITER: 1 + GGML_N_THREADS: 1 jobs: ubuntu-focal-make: @@ -64,7 +67,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -95,6 +98,40 @@ jobs: cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} + - name: Test + id: cmake_test + run: | + cd build + ctest --verbose --timeout 900 + + ubuntu-latest-cmake-mpi: + runs-on: ubuntu-latest + + continue-on-error: true + + strategy: + matrix: + mpi_library: [mpich, libopenmpi-dev] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v1 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential ${{ matrix.mpi_library }} + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake -DLLAMA_MPI=ON .. + cmake --build . --config Release + - name: Test id: cmake_test run: | @@ -111,6 +148,7 @@ jobs: - name: Dependencies id: depends + continue-on-error: true run: | brew update @@ -129,25 +167,28 @@ jobs: - name: Dependencies id: depends + continue-on-error: true run: | brew update - name: Build id: cmake_build run: | + sysctl -a mkdir build cd build - cmake -DLLAMA_AVX2=OFF .. + cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF .. cmake --build . 
--config Release - name: Test id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 windows-latest-cmake: runs-on: windows-latest + env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 @@ -246,7 +287,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --verbose + ctest -C Release --verbose --timeout 900 - name: Get commit hash id: commit diff --git a/.gitignore b/.gitignore index 4fccec31b..faec869e0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ build-static/ build-cublas/ build-opencl/ build-metal/ +build-mpi/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ diff --git a/CMakeLists.txt b/CMakeLists.txt index aa8f33b40..86ac8c798 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,18 +68,20 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") -set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) +option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_BUILD_SERVER "llama: build server example" ON) # # Build info header @@ -216,6 +218,9 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) @@ -246,8 +251,14 @@ if (LLAMA_CUBLAS) set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) + if (LLAMA_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + if (DEFINED LLAMA_CUDA_DMMV_Y) + add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility + endif() if (LLAMA_CUDA_DMMV_F16) add_compile_definitions(GGML_CUDA_DMMV_F16) endif() @@ -261,9 +272,9 @@ if (LLAMA_CUBLAS) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if (LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics else() - set(CMAKE_CUDA_ARCHITECTURES "52") # 
lowest CUDA 12 standard + set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -298,6 +309,28 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MPI) + cmake_minimum_required(VERSION 3.10) + find_package(MPI) + if (MPI_C_FOUND) + message(STATUS "MPI found") + set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + add_compile_definitions(GGML_USE_MPI) + add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) + set(cxx_flags ${cxx_flags} -Wno-cast-qual) + set(c_flags ${c_flags} -Wno-cast-qual) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) + # Even if you're only using the C header, C++ programs may bring in MPI + # C++ functions, so more linkage is needed + if (MPI_CXX_FOUND) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES}) + endif() + else() + message(WARNING "MPI not found") + endif() +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -417,11 +450,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES if (MSVC) # TODO: arm msvc? else() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") - # Apple M1, M2, etc. - # Raspberry Pi 3, 4, Zero 2 (64-bit) - add_compile_options(-mcpu=native) - endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") # Raspberry Pi 1, Zero add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access) @@ -502,6 +530,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} + ${GGML_SOURCES_MPI} ${GGML_SOURCES_EXTRA} ) diff --git a/Makefile b/Makefile index 8bc3ad7e4..1efda93ef 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test - -ifdef LLAMA_BUILD_SERVER - BUILD_TARGETS += server - LLAMA_SERVER_VERBOSE ?= 1 -server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) -endif +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test default: $(BUILD_TARGETS) @@ -61,6 +55,10 @@ else CXXFLAGS += -DNDEBUG endif +ifdef LLAMA_SERVER_VERBOSE + CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) +endif + # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar @@ -149,9 +147,15 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE +ifdef LLAMA_MPI + CFLAGS += -DGGML_USE_MPI -Wno-cast-qual + CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual + OBJS += ggml-mpi.o +endif # LLAMA_MPI + ifdef LLAMA_OPENBLAS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas - LDFLAGS += -lopenblas + CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas) + LDFLAGS += $(shell pkg-config --libs openblas) endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS @@ -165,17 +169,27 @@ ifdef LLAMA_CUBLAS LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc - NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native + NVCCFLAGS = --forward-unknown-to-host-compiler +ifdef 
CUDA_DOCKER_ARCH + NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) +else + NVCCFLAGS += -arch=native +endif # CUDA_DOCKER_ARCH +ifdef LLAMA_CUDA_FORCE_DMMV + NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # LLAMA_CUDA_FORCE_DMMV ifdef LLAMA_CUDA_DMMV_X NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) else NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 endif # LLAMA_CUDA_DMMV_X -ifdef LLAMA_CUDA_DMMV_Y - NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y) +ifdef LLAMA_CUDA_MMV_Y + NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +else ifdef LLAMA_CUDA_DMMV_Y + NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility else - NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1 -endif # LLAMA_CUDA_DMMV_Y + NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 +endif # LLAMA_CUDA_MMV_Y ifdef LLAMA_CUDA_DMMV_F16 NVCCFLAGS += -DGGML_CUDA_DMMV_F16 endif # LLAMA_CUDA_DMMV_F16 @@ -184,6 +198,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif + ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS @@ -208,9 +223,6 @@ ifdef LLAMA_METAL CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders OBJS += ggml-metal.o - -ggml-metal.o: ggml-metal.m ggml-metal.h - $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL ifdef LLAMA_VULKAN @@ -249,6 +261,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_METAL +ggml-metal.o: ggml-metal.m ggml-metal.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_METAL + +ifdef LLAMA_MPI +ggml-mpi.o: ggml-mpi.c ggml-mpi.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_MPI + ifdef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ @@ -286,7 +308,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h # # Examples diff --git a/README.md b/README.md index ee56988c7..f45e4bf08 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998 - k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001 - New roadmap: https://github.com/users/ggerganov/projects/7 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985 @@ -85,6 +86,7 @@ as the main playground for developing new features for the [ggml](https://github - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b) - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) +- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) **Bindings:** @@ -237,7 +239,7 @@ In order to build llama.cpp you have three different options. 
- Using `Zig`: ```bash - zig build -Drelease-fast + zig build -Doptimize=ReleaseFast ``` ### Metal Build @@ -266,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example: ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1 ``` +### MPI Build + +MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine. + +First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc). + +Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically): + +- Using `make`: + + ```bash + make CC=mpicc CXX=mpicxx LLAMA_MPI=1 + ``` + +- Using `CMake`: + + ```bash + cmake -S . -B build -DLLAMA_MPI=ON + ``` + +Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. + +Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost". + +Here is an example hostfile: + +``` +192.168.0.1:2 +malvolio.local:1 +``` + +The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive. + +Finally, you're ready to run a computation using `mpirun`: + +```bash +mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 +``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: @@ -343,8 +384,9 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. 
|
+  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
   | LLAMA_CUDA_DMMV_F16     | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
   | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
@@ -598,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
@@ -692,7 +734,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
 
-Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
 
 ### Docker
 
@@ -728,6 +770,38 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
+### Docker With CUDA
+
+Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU-enabled cloud, `cuBLAS` should be accessible inside the container.
+
+#### Building Locally
+
+```bash
+docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `CUDA_VERSION` set to `11.7.1`
+- `CUDA_DOCKER_ARCH` set to `all`
+
+The resulting images are essentially the same as the non-CUDA images:
+
+1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+
+#### Usage
+
+After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
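Before running a model it can be worth confirming that the container actually sees the GPU. The following is a sketch rather than part of the official workflow: `--entrypoint` overrides the image's default entrypoint, and `nvidia-smi` is made available inside the container by the NVIDIA runtime when `--gpus` is passed.

```bash
# sanity check: the GPU should be listed before attempting any layer offload
docker run --gpus all --entrypoint nvidia-smi local/llama.cpp:light-cuda
```

The full run commands below then follow the usual pattern, with the GPU flags added.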
+ +```bash +docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +``` + ### Contributing - Contributors can open PRs @@ -748,5 +822,10 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode ### Docs -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [main](./examples/main/README.md) +- [server](./examples/server/README.md) +- [embd-input](./examples/embd-input/README.md) +- [jeopardy](./examples/jeopardy/README.md) +- [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) +- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) diff --git a/build.zig b/build.zig index 49c159ebf..2287d2a2c 100644 --- a/build.zig +++ b/build.zig @@ -1,9 +1,19 @@ const std = @import("std"); +const commit_hash = @embedFile(".git/refs/heads/master"); -// Zig Version: 0.11.0-dev.3379+629f0d23b +// Zig Version: 0.11.0-dev.3986+e05c242cd pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); + + const config_header = b.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline + }, + ); + const lib = b.addStaticLibrary(.{ .name = "llama", .target = target, @@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void { lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("./examples"); - lib.addCSourceFiles(&.{ - "ggml.c", - }, &.{"-std=c11"}); - lib.addCSourceFiles(&.{ - "llama.cpp", - }, &.{"-std=c++11"}); + lib.addConfigHeader(config_header); + lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); + lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); b.installArtifact(lib); const examples = .{ "main", "baby-llama", "embedding", - // "metal", + "metal", "perplexity", "quantize", "quantize-stats", "save-load-state", - // "server", + "server", "simple", "train-text-from-scratch", }; @@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void { }); exe.addIncludePath("."); exe.addIncludePath("./examples"); + exe.addConfigHeader(config_header); exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}), + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), "examples/common.cpp", }, &.{"-std=c++11"}); exe.linkLibrary(lib); b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); run_cmd.step.dependOn(b.getInstallStep()); if (b.args) |args| run_cmd.addArgs(args); - const run_step = b.step("run_" ++ example_name, "Run the app"); + + const run_step = b.step("run-" ++ example_name, "Run the app"); run_step.dependOn(&run_cmd.step); } } diff --git a/convert.py b/convert.py index e340d2273..7a2705e5c 100644 --- a/convert.py +++ b/convert.py @@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int: calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult if calc_ff == n_ff: return n_mult - return 1 + raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).") @dataclass class Params: @@ -154,9 +154,15 @@ class Params: # try transformer naming 
first if "model.layers.0.self_attn.q_proj.weight" in model: n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + if n_layer < 1: + raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + n_head=n_embd // 128 # guessed return Params( @@ -321,6 +327,10 @@ class Tensor(metaclass=ABCMeta): @abstractmethod def permute(self, n_head: int) -> 'Tensor': ... @abstractmethod + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... + @abstractmethod + def part(self, n_part: int) -> 'UnquantizedTensor': ... + @abstractmethod def to_ggml(self) -> 'GGMLCompatibleTensor': ... @@ -345,6 +355,14 @@ class UnquantizedTensor(Tensor): def to_ggml(self) -> 'UnquantizedTensor': return self + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + + def part(self, n_part: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) + def permute(self, n_head: int) -> 'UnquantizedTensor': return UnquantizedTensor(permute(self.ndarray, n_head)) @@ -642,6 +660,19 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: return lazy_tensor.load().permute(n_head) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) +def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute_part(n_part, n_head) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + +def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().part(n_part) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} @@ -650,11 +681,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out["output.weight"] = model["lm_head.weight"] for i in itertools.count(): - if f"model.layers.{i}.self_attn.q_proj.weight" not in model: + if f"model.layers.{i}.self_attn.q_proj.weight" in model: + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + elif f"model.layers.{i}.self_attn.W_pack.weight" in model: + out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) + 
out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + else: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) - out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] @@ -791,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { + 'BF16': DT_BF16, 'F16': DT_F16, 'F32': DT_F32, 'I32': DT_I32, diff --git a/examples/alpaca.sh b/examples/alpaca.sh index aef207f36..8d2bae691 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,7 +7,7 @@ cd `dirname $0` cd .. -./main -m ./models/ggml-alpaca-7b-q4.bin \ +./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \ --color \ -f ./prompts/alpaca.txt \ --ctx_size 2048 \ diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 212f54d32..4965881ec 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) { return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, @@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + std::vector work_buffer; + for (int ex=0; ex & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -159,13 +170,14 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); TENSOR_DUMP(m11); TENSOR_DUMP(m2); - ggml_graph_compute(ctx, &gf); + std::vector work_buffer; + + ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); TENSOR_DUMP(gf.nodes[0]); @@ -187,7 +199,6 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); - gf31.n_threads=benchmark_params.n_threads; // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -199,8 +210,7 @@ int main(int argc, char ** argv) { //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); - gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); const int 
dimx = sizex; const int dimy = sizey; @@ -221,14 +231,15 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute(ctx, &gf31); + ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); + long long int stop = ggml_time_us(); long long int usec = stop-start; double gflops = (double)(flops_per_matrix)/usec/1000.0; gflops_sum += gflops; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", i, - gf31.n_threads, + benchmark_params.n_threads, sizex, sizey, sizez, flops_per_matrix, usec,gflops); @@ -253,7 +264,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute(ctx, &gf32); + ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/common.cpp b/examples/common.cpp index 3278a0643..8705127cb 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -236,6 +248,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.mirostat_tau = std::stof(argv[i]); + } else if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_negative_prompt = argv[i]; + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_scale = std::stof(argv[i]); + } else if (arg == "--cfg-smooth-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_smooth_factor = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -418,6 +448,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (escape_prompt) { process_escapes(params.prompt); + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); } return true; @@ -468,7 +500,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); + fprintf(stderr, " negative prompt to use for guidance. 
(default: empty)\n");
+    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -534,7 +572,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
@@ -549,6 +587,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.use_mlock   = params.use_mlock;
     lparams.logits_all  = params.perplexity;
     lparams.embedding   = params.embedding;
+    lparams.rope_freq_base  = params.rope_freq_base;
+    lparams.rope_freq_scale = params.rope_freq_scale;
+
+    return lparams;
+}
+
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
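Taken together, the new flags expose classifier-free guidance from the command line: the prompt is evaluated a second time against the negative prompt in a separate context, and the two sets of logits are blended with strength `--cfg-scale`. A hypothetical invocation might look like this (the model path and sampling values are illustrative, not recommendations):

```bash
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 \
    -p "Write a polite product review:" \
    --cfg-negative-prompt "Write a rude product review:" \
    --cfg-scale 2.0 --cfg-smooth-factor 0.5
```

Guidance only activates for `--cfg-scale` values above 1.0, and the second `llama_context` allocated for the negative prompt means memory use roughly doubles.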
diff --git a/examples/common.h b/examples/common.h
index 66e567291..f52fef629 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -31,7 +31,9 @@ struct gpt_params {
     int32_t n_gpu_layers = 0;  // number of layers to store in VRAM
     int32_t main_gpu     = 0;  // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    bool    low_vram     = 0;  // if true, reduce VRAM usage at the cost of performance
+    int32_t n_probs      = 0;  // if greater than 0, output the probabilities of top n_probs tokens.
+    float   rope_freq_base  = 10000.0f; // RoPE base frequency
+    float   rope_freq_scale = 1.0f;     // RoPE frequency scaling factor
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -48,6 +50,12 @@ struct gpt_params {
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
 
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt; // string to help guidance
+    float       cfg_scale         = 1.f; // How strong is guidance
+    float       cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
+
     std::string model       = "models/7B/ggml-model.bin"; // model path
     std::string model_alias = "unknown"; // model alias
     std::string prompt      = "";
@@ -59,6 +67,7 @@ struct gpt_params {
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base    = ""; // base model path for the lora adapter
 
+    bool low_vram      = false; // if true, reduce VRAM usage at the cost of performance
     bool memory_f16    = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color     = false; // use color to distinguish generations and inputs
@@ -98,6 +107,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 //
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils
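The two RoPE fields above are plumbed straight through to the context parameters, so extended-context runs can be driven entirely from the command line. As a sketch (the specific values are an assumption for stretching a 2048-token model to a 4096-token window, not a tuned recommendation):

```bash
./main -m ./models/7B/ggml-model-q4_0.bin \
    -c 4096 --rope-freq-scale 0.5 \
    -p "Summarize the following text:" -n 256
```

Halving the frequency scale spreads the original positional range across twice as many tokens.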
diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md
index 02d028f26..5c4c75ea7 100644
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@@ -17,7 +17,7 @@ make
 import torch
 
 bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd_input/llava_projection.pth"
+pth_path = "./examples/embd-input/llava_projection.pth"
 
 dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 37de52ad6..26563821a 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -29,12 +29,12 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
 const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
-    std::string ret;
-    if (id == llama_token_eos()) ret = "</s>";
-    else ret = llama_token_to_str(ctx, id);
+    static std::string ret;
+    if (id == llama_token_eos()) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_str(ctx, id);
+    }
     eval_id(mymodel, id);
     return ret.c_str();
 }
diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h
index 4fefabd42..efb5ba5e2 100644
--- a/examples/embd-input/embd-input.h
+++ b/examples/embd-input/embd-input.h
@@ -5,7 +5,6 @@
 #include "llama.h"
 #include "build-info.h"
 
-
 extern "C" {
 
 typedef struct MyModel {
@@ -14,14 +13,13 @@ typedef struct MyModel {
     int n_past = 0;
 } MyModel;
 
-
 struct MyModel* create_mymodel(int argc, char ** argv);
 
 bool eval_float(void* model, float* input, int N);
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 bool eval_id(struct MyModel* mymodel, int id);
 bool eval_string(struct MyModel* mymodel, const char* str);
-const char* sampling(struct MyModel* mymodel);
+const char * sampling(struct MyModel* mymodel);
 llama_token sampling_id(struct MyModel* mymodel);
 void free_mymodel(struct MyModel* mymodel);
 
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
index 2f20cb722..bcbdd2bed 100644
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -59,7 +59,7 @@ if __name__=="__main__":
     # Also here can use pytorch_model-00003-of-00003.bin directly.
     a.load_projection(os.path.join(
         os.path.dirname(__file__) ,
-        "llava_projetion.pth"))
+        "llava_projection.pth"))
     respose = a.chat_with_image(
         Image.open("./media/llama1-logo.png").convert('RGB'),
         "what is the text in the picture?")
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 2b7eb39c5..5192d6df5 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
+    llama_backend_free();
+
     return 0;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 3a171925b..bcbcf12b0 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
+                " you are on your own\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
@@ -105,14 +113,20 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
+    llama_context * ctx_guidance = NULL;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (params.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -183,15 +197,28 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
 
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
     }
 
+    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        params.cfg_negative_prompt.insert(0, 1, ' ');
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+    }
+
     const int n_ctx = llama_n_ctx(ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -258,6 +285,16 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < (int) embd_inp.size(); i++) {
             fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
         }
+
+        if (ctx_guidance) {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+            }
+        }
+
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -334,11 +371,13 @@ int main(int argc, char ** argv) {
     int n_remain           = params.n_predict;
     int n_consumed         = 0;
     int n_session_consumed = 0;
+    int n_past_guidance    = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console_set_color(con_st, CONSOLE_COLOR_PROMPT);
 
     std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;
 
     // do one empty run to warm up the model
     {
@@ -367,11 +406,12 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) {
                 const int n_left = n_past - params.n_keep;
 
                 // always keep the first token - BOS
                 n_past = std::max(1, params.n_keep);
+                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
 
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -412,6 +452,48 @@ int main(int argc, char ** argv) {
 
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
+
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token* input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() +
original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for (int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -431,6 +513,7 @@ int main(int argc, char ** argv) { } embd.clear(); + embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token @@ -473,6 +556,10 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor); + } + // Apply penalties float nl_logit = logits[llama_token_nl()]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); @@ -668,8 +755,11 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); + llama_backend_free(); + return 0; } diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index cdfe4bfe9..7438defde 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -35,10 +35,9 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(); + auto * ctx_metal = ggml_metal_init(1); const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index dd54ed3c4..7e120ff12 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -130,7 +130,7 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, params.n_ctx); if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); } @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(params.numa); + llama_backend_init(params.numa); llama_model * model; llama_context * ctx; @@ -172,5 +172,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); + llama_backend_free(); + return 0; } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 9cea472de..6aa06ec8f 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -147,7 +147,7 
@@ void test_roundtrip_on_chunk(
     const ggml_tensor * layer,
     int64_t offset,
     int64_t chunk_size,
-    const quantize_fns_t & qfns,
+    const ggml_type_traits_t & qfns,
     bool use_reference,
     float * input_scratch,
     char * quantized_scratch,
@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
     }
 
     if (use_reference) {
-        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
     }
-    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+    qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
     update_error_stats(chunk_size, input_scratch, output_scratch, stats);
 }
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
 void test_roundtrip_on_layer(
     std::string & name,
     bool print_layer_stats,
-    const quantize_fns_t & qfns,
+    const ggml_type_traits_t & qfns,
     bool use_reference,
     const ggml_tensor * layer,
     std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        if (qfns.from_float && qfns.to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 1eb0f75d6..797d2f0c5 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend(false);
+    llama_backend_init(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
@@ -257,5 +257,7 @@ int main(int argc, char ** argv) {
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
     }
 
+    llama_backend_free();
+
     return 0;
 }
diff --git a/examples/server/README.md b/examples/server/README.md
index ba4b2fec9..e5ca8269b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,13 +1,13 @@
 # llama.cpp/example/server
 
-This example demonstrates a simple HTTP API server to interact with llama.cpp.
+This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
 
 Command line options:
 
 - `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
@@ -21,24 +21,22 @@ Command line options:
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
 - `--port`: Set the port to listen. Default: `8080`.
+- `--path`: path from which to serve static files (default examples/server/public)
 - `--embedding`: Enable embedding extraction, Default: disabled.
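Putting several of the options above together, a GPU-offloaded launch might look like the sketch below; the layer count and model path are illustrative, not recommendations (compare the `--server` example in `.devops/tools.sh`):

```bash
./server -m ./models/7B/ggml-model-q4_0.bin -c 2048 -ngl 32 --host 0.0.0.0 --port 8080
```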
Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. @@ -21,24 +21,22 @@ Command line options: - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. +- `--path`: path from which to serve static files (default examples/server/public) - `--embedding`: Enable embedding extraction, Default: disabled. ## Build -Build llama.cpp with server from repository root with either make or CMake. +server is build alongside everything else from the root of the project - Using `make`: ```bash - LLAMA_BUILD_SERVER=1 make + make ``` - Using `CMake`: ```bash - mkdir build-server - cd build-server - cmake -DLLAMA_BUILD_SERVER=ON .. cmake --build . --config Release ``` @@ -59,7 +57,7 @@ server.exe -m models\7B\ggml-model.bin -c 2048 ``` The above command will start a server that by default listens on `127.0.0.1:8080`. -You can consume the endpoints with Postman or NodeJS with axios library. +You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. ## Testing with CURL @@ -68,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the ```sh curl --request POST \ --url http://localhost:8080/completion \ + --header "Content-Type: application/json" \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` @@ -190,3 +189,49 @@ Run with bash: ```sh bash chat.sh ``` + +### API like OAI + +API example using Python Flask: [api_like_OAI.py](api_like_OAI.py) +This example must be used with server.cpp + +```sh +python api_like_OAI.py +``` + +After running the API server, you can use it in Python by setting the API base URL. +```python +openai.api_base = "http://:port" +``` + +Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API + +### Extending or building alternative Web Front End + +The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. + +Read the documentation in `/completion.js` to see convenient ways to access llama. + +A simple example is below: + +```html + + +
+
+### Extending or building alternative Web Front End
+
+The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the `llamaComplete()` method.
+
+Read the documentation in `/completion.js` to see convenient ways to access llama.
+
+A simple example is below:
+
+```html
+<html>
+  <body>
+    <pre>
+      <script type="module">
+        import { llamaComplete } from '/completion.js'
+
+        llamaComplete({
+            prompt: "Tell me a joke",
+            n_predict: 128,
+          },
+          null,
+          (chunk) => document.write(chunk.data.content)
+        )
+      </script>
+    </pre>
+  </body>
+</html>
+ + +``` diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py new file mode 100755 index 000000000..aa325a03e --- /dev/null +++ b/examples/server/api_like_OAI.py @@ -0,0 +1,219 @@ +import argparse +from flask import Flask, jsonify, request, Response +import urllib.parse +import requests +import time +import json + + +app = Flask(__name__) + +parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") +parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n') +parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ") +parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ") +parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ") +parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") +parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080') +parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") +parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1') +parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081) + +args = parser.parse_args() + +def is_present(json, key): + try: + buf = json[key] + except KeyError: + return False + return True + + + +#convert chat to prompt +def convert_chat(messages): + prompt = "" + args.chat_prompt.replace("\\n", "\n") + + system_n = args.system_name.replace("\\n", "\n") + user_n = args.user_name.replace("\\n", "\n") + ai_n = args.ai_name.replace("\\n", "\n") + stop = args.stop.replace("\\n", "\n") + + + for line in messages: + if (line["role"] == "system"): + prompt += f"{system_n}{line['content']}" + if (line["role"] == "user"): + prompt += f"{user_n}{line['content']}" + if (line["role"] == "assistant"): + prompt += f"{ai_n}{line['content']}{stop}" + prompt += ai_n.rstrip() + + return prompt + +def make_postData(body, chat=False, stream=False): + postData = {} + if (chat): + postData["prompt"] = convert_chat(body["messages"]) + else: + postData["prompt"] = body["prompt"] + if(is_present(body, "temperature")): postData["temperature"] = body["temperature"] + if(is_present(body, "top_k")): postData["top_k"] = body["top_k"] + if(is_present(body, "top_p")): postData["top_p"] = body["top_p"] + if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"] + if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"] + if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"] + if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"] + if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"] + if(is_present(body, "mirostat_tau")): 
postData["mirostat_tau"] = body["mirostat_tau"] + if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] + if(is_present(body, "seed")): postData["seed"] = body["seed"] + if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] + if (args.stop != ""): + postData["stop"] = [args.stop] + else: + postData["stop"] = [] + if(is_present(body, "stop")): postData["stop"] += body["stop"] + postData["n_keep"] = -1 + postData["stream"] = stream + + return postData + +def make_resData(data, chat=False, promptToken=[]): + resData = { + "id": "chatcmpl" if (chat) else "cmpl", + "object": "chat.completion" if (chat) else "text_completion", + "created": int(time.time()), + "truncated": data["truncated"], + "model": "LLaMA_CPP", + "usage": { + "prompt_tokens": data["tokens_evaluated"], + "completion_tokens": data["tokens_predicted"], + "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"] + } + } + if (len(promptToken) != 0): + resData["promptToken"] = promptToken + if (chat): + #only one choice is supported + resData["choices"] = [{ + "index": 0, + "message": { + "role": "assistant", + "content": data["content"], + }, + "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" + }] + else: + #only one choice is supported + resData["choices"] = [{ + "text": data["content"], + "index": 0, + "logprobs": None, + "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" + }] + return resData + +def make_resData_stream(data, chat=False, time_now = 0, start=False): + resData = { + "id": "chatcmpl" if (chat) else "cmpl", + "object": "chat.completion.chunk" if (chat) else "text_completion.chunk", + "created": time_now, + "model": "LLaMA_CPP", + "choices": [ + { + "finish_reason": None, + "index": 0 + } + ] + } + if (chat): + if (start): + resData["choices"][0]["delta"] = { + "role": "assistant" + } + else: + resData["choices"][0]["delta"] = { + "content": data["content"] + } + if (data["stop"]): + resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" + else: + resData["choices"][0]["text"] = data["content"] + if (data["stop"]): + resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" + + return resData + + +@app.route('/chat/completions', methods=['POST']) +@app.route('/v1/chat/completions', methods=['POST']) +def chat_completions(): + if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): + return Response(status=403) + body = request.get_json() + stream = False + tokenize = False + if(is_present(body, "stream")): stream = body["stream"] + if(is_present(body, "tokenize")): tokenize = body["tokenize"] + postData = make_postData(body, chat=True, stream=stream) + + promptToken = [] + if (tokenize): + tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() + promptToken = tokenData["tokens"] + + if (not stream): + data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) + print(data.json()) + resData = make_resData(data.json(), chat=True, promptToken=promptToken) + return jsonify(resData) + else: + def generate(): + data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) + 
time_now = int(time.time()) + resData = make_resData_stream({}, chat=True, time_now=time_now, start=True) + yield 'data: {}\n'.format(json.dumps(resData)) + for line in data.iter_lines(): + if line: + decoded_line = line.decode('utf-8') + resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now) + yield 'data: {}\n'.format(json.dumps(resData)) + return Response(generate(), mimetype='text/event-stream') + + +@app.route('/completions', methods=['POST']) +@app.route('/v1/completions', methods=['POST']) +def completion(): + if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): + return Response(status=403) + body = request.get_json() + stream = False + tokenize = False + if(is_present(body, "stream")): stream = body["stream"] + if(is_present(body, "tokenize")): tokenize = body["tokenize"] + postData = make_postData(body, chat=False, stream=stream) + + promptToken = [] + if (tokenize): + tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() + promptToken = tokenData["tokens"] + + if (not stream): + data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) + print(data.json()) + resData = make_resData(data.json(), chat=False, promptToken=promptToken) + return jsonify(resData) + else: + def generate(): + data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) + time_now = int(time.time()) + for line in data.iter_lines(): + if line: + decoded_line = line.decode('utf-8') + resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) + yield 'data: {}\n'.format(json.dumps(resData)) + return Response(generate(), mimetype='text/event-stream') + +if __name__ == '__main__': + app.run(args.host, port=args.port) diff --git a/examples/server/chat.sh b/examples/server/chat.sh index a89f8e908..014360121 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -32,6 +32,7 @@ tokenize() { --silent \ --request POST \ --url "${API_URL}/tokenize" \ + --header "Content-Type: application/json" \ --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ | jq '.tokens[]' } @@ -64,6 +65,7 @@ chat_completion() { --no-buffer \ --request POST \ --url "${API_URL}/completion" \ + --header "Content-Type: application/json" \ --data-raw "${DATA}") printf "\n" diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp new file mode 100644 index 000000000..f399fb19a --- /dev/null +++ b/examples/server/completion.js.hpp @@ -0,0 +1,375 @@ +unsigned char completion_js[] = { + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, + 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72, + 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, + 0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, + 0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, + 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d, + 0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, + 0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a, + 0x0a, 
0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, + 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, + 0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65, + 0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73, + 0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e, + 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, + 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, + 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, + 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22, + 0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f, + 0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, + 0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f, + 0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, + 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, + 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a, + 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, + 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, + 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, + 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, + 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, + 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, + 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c, + 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, + 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 
0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27, + 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, + 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, + 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27, + 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76, + 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70, 0x65, 0x27, + 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27, + 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e, + 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, + 0x6e, 0x61, 0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, + 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, + 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, + 0x79, 0x2e, 0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, + 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, + 0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, + 0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, + 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, + 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61, + 0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, + 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, + 0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66, + 0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69, + 0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61, + 0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61, + 0x73, 
0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20, + 0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69, + 0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f, + 0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, + 0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77, + 0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65, + 0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, + 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, + 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61, + 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, + 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, + 0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, + 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, + 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20, + 0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c, + 0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, + 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, + 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20, + 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, + 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73, + 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74, + 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, + 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, + 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, + 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, + 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, + 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, + 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, + 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, + 0x65, 
0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, + 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, + 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, + 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, + 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, + 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, + 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, + 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, + 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, + 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, + 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, + 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, + 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72, + 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, + 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, + 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, + 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, + 0x65, 
0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, + 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, + 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, + 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, + 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, + 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, + 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, + 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, + 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, + 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, + 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, + 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, + 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, + 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, + 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, + 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 
0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, + 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, + 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, + 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, + 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, + 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, + 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, + 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, + 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, + 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, + 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, + 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, + 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, + 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, + 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, + 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, + 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, + 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, + 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, + 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, + 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, + 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, + 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x50, 
0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, + 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, + 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, + 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, + 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, + 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, + 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, + 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, + 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, + 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, + 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, + 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, + 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, + 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, + 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, + 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, + 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, + 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, + 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, + 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, + 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, + 0x63, 
0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, + 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, + 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, + 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, + 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, + 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, + 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, + 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, + 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29, + 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, + 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, + 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a +}; +unsigned int completion_js_len = 4462; diff --git a/examples/server/deps.sh b/examples/server/deps.sh new file mode 100755 index 000000000..1e9fe964b --- /dev/null +++ b/examples/server/deps.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Download and update deps for binary + +# get the directory of this script file +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +PUBLIC=$DIR/public + +echo "download js bundle files" +curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js +echo >> $PUBLIC/index.js # add newline + +FILES=$(ls $PUBLIC) + +for FILE in $FILES; do + func=$(echo $FILE | tr '.' 
'_') + echo "generate $FILE.hpp ($func)" + xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp +done diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp new file mode 100644 index 000000000..42707fad9 --- /dev/null +++ b/examples/server/index.html.hpp @@ -0,0 +1,899 @@ +unsigned char index_html[] = { + 0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x0a, 0x3c, 0x68, 0x65, 0x61, + 0x64, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x63, + 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46, 0x2d, + 0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f, + 0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d, + 0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65, 0x76, 0x69, 0x63, + 0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20, 0x69, 0x6e, 0x69, + 0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, + 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2d, 0x73, 0x63, + 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, + 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61, 0x74, 0x3c, + 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, + 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, + 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, + 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x66, 0x66, 0x66, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, + 0x20, 0x23, 0x30, 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, + 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, + 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, + 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, + 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, 0x6e, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, 0x74, + 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, + 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77, 0x65, + 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, + 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, + 0x61, 0x69, 0x6e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, + 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, + 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 
0x63, 0x6f, 0x6c, 0x75, + 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, + 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, + 0x77, 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, + 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79, 0x3a, + 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x31, 0x70, + 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, 0x63, 0x63, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x35, + 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, + 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x61, 0x78, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, + 0x20, 0x36, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, + 0x20, 0x33, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x68, 0x65, 0x69, 0x67, 0x68, + 0x74, 0x3a, 0x20, 0x31, 0x2e, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, + 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20, 0x30, + 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, + 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, + 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x77, 0x6f, 0x72, 0x64, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x79, 0x70, 0x68, 0x65, 0x6e, 0x73, + 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, + 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62, 0x6f, + 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x23, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, + 0x69, 0x6e, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x20, 0x30, 0x20, + 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, + 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, + 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, + 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, + 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, 0x2d, + 0x69, 0x74, 0x65, 0x6d, 0x73, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x65, 0x74, + 0x63, 0x68, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, + 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, + 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65, 0x6e, + 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, + 0x72, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, + 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, + 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, + 0x72, 0x65, 0x61, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, + 0x2d, 0x67, 0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, + 0x30, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, + 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, + 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, + 0x20, 0x23, 0x32, 0x32, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, + 0x3a, 0x20, 0x6d, 0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, + 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, + 0x33, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, + 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, + 0x3a, 0x20, 0x33, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, + 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 
0x6c, 0x61, 0x79, 0x3a, + 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, + 0x72, 0x2c, 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, + 0x6c, 0x69, 0x67, 0x6e, 0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, + 0x65, 0x3a, 0x20, 0x38, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, + 0x38, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x3c, + 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x3c, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, + 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x20, 0x68, + 0x2c, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x65, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, + 0x65, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x20, + 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x20, 0x75, + 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x75, 0x73, + 0x65, 0x52, 0x65, 0x66, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x66, + 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x2e, + 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, + 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x73, + 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x22, 0x54, + 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, + 0x76, 0x65, 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x65, + 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x61, + 0x6e, 0x64, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x61, 0x20, + 0x66, 0x72, 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79, 0x20, 0x63, 0x68, 0x61, + 0x74, 0x62, 0x6f, 0x74, 0x2e, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, + 0x64, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, + 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x2e, 0x22, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70, + 0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73, + 0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68, + 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61, + 0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 
0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, + 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x72, + 0x3a, 0x20, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, 0x20, 0x22, + 0x55, 0x73, 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, + 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x3a, 0x20, + 0x34, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, 0x20, + 0x30, 0x2e, 0x37, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, + 0x3a, 0x20, 0x32, 0x35, 0x36, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, + 0x6c, 0x74, 0x79, 0x3a, 0x20, 0x31, 0x2e, 0x31, 0x38, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x3a, 0x20, + 0x34, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, + 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, + 0x74, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, + 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, + 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, + 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, + 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, + 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x20, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, + 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, + 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, + 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, + 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x74, 0x72, + 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x78, 0x74, 0x72, + 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, + 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, + 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x2c, 0x20, 0x2e, + 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, + 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x41, + 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, 0x5c, 0x7b, 0x28, 0x2e, 0x2a, 0x3f, + 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, 0x67, 0x2c, 0x20, 0x28, 0x5f, 0x2c, + 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, + 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, 0x5d, 0x29, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, + 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, + 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, + 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, + 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, + 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, + 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 
0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, + 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, + 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x5d, 0x29, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x3a, + 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x28, 0x5b, + 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, + 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, + 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x7d, 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x22, 0x5c, + 0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, + 0x74, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, + 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, + 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, + 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, + 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x22, + 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, 0x5d, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, + 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3a, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 
0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x64, 0x61, + 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c, 0x65, 0x61, 0x64, 0x69, + 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, + 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x22, 0x22, + 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, 0x74, + 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, + 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, + 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x5d, 0x5d, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, + 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x66, 0x69, 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x22, + 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, 0x75, + 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, 0x61, + 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, + 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, + 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, + 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, + 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, + 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, + 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, 0x75, + 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, + 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 
0x74, 0x6f, 0x70, 0x20, + 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, + 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, + 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, + 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, + 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x22, 0x22, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, + 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x20, + 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x68, 0x69, + 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, 0x20, 0x26, 0x26, + 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x68, 0x69, 0x66, + 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, + 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, + 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x3d, 0x24, 0x7b, + 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x72, + 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, + 0x72, 0x65, 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, + 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, + 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, + 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, 0x73, 0x6f, + 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x22, 0x2f, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, + 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, 0x22, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, 0x20, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, + 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, + 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, + 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x61, 0x74, 0x69, 0x6e, 0x67, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, 0x3c, + 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, + 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, + 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, + 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, + 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x75, + 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 
0x6c, 0x6c, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, + 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, + 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, + 0x26, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, + 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, + 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, + 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, + 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, + 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, 0x30, + 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, + 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, + 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, + 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, 0x6b, + 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, 0x3e, 0x3c, 0x73, + 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, 0x3a, + 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x3c, 0x24, + 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, + 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x7d, 0x20, + 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x68, + 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, + 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 
0x69, 0x6e, 0x65, 0x29, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, + 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, + 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, + 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, + 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, + 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, + 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, + 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, 0x20, + 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, + 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, + 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, + 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, + 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, + 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 
0x74, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, + 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, + 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, + 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, + 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, + 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, + 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, + 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, + 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 
0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, + 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, + 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, + 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, + 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, + 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x3e, 0x54, 0x65, + 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, + 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, + 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x20, 0x6d, 0x69, 0x6e, + 0x3d, 0x22, 0x30, 0x2e, 0x30, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, + 0x31, 0x2e, 0x30, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d, 0x22, 0x30, + 0x2e, 0x30, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, + 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x20, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, + 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x22, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 
0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, + 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, + 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, + 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x6e, 0x50, 0x72, 0x65, 0x64, + 0x69, 0x63, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, + 0x64, 0x3d, 0x22, 0x6e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, + 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x6d, 0x61, 0x78, + 0x3d, 0x22, 0x32, 0x30, 0x34, 0x38, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, + 0x3d, 0x22, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6e, + 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x20, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, + 0x65, 0x64, 0x69, 0x63, 0x74, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, + 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, + 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, + 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, + 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, + 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, + 0x6c, 0x74, 0x79, 0x22, 0x3e, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, + 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, 0x73, 0x65, 0x71, + 0x75, 0x65, 0x6e, 0x63, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, + 0x69, 0x64, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, + 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, + 0x22, 0x30, 0x2e, 0x30, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x32, + 0x2e, 0x30, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 
0x3d, 0x22, 0x30, 0x2e, + 0x30, 0x31, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x72, 0x65, + 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, + 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, + 0x74, 0x79, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, + 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x3e, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, + 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, + 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, + 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x72, + 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, + 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x30, 0x2e, 0x30, 0x22, 0x20, + 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x32, 0x30, 0x34, 0x38, 0x22, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, + 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, + 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x22, 0x20, 0x6f, 0x6e, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, + 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, + 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 
0x2f, 0x20, 0x70, 0x6f, + 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, + 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, + 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, 0x2c, 0x36, + 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, 0x6d, 0x2c, + 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x68, 0x33, + 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, + 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, + 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, + 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f, 0x28, 0x2e, + 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, + 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, + 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x2f, + 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, + 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, + 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60, 0x60, 0x2e, + 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53, 0x5d, 0x2a, + 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x70, + 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, + 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72, 0x65, 0x3e, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x28, 0x2e, + 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x63, 0x6f, + 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, + 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, + 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, + 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, + 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, + 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f, 0x68, 0x74, + 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 
0x7d, 0x20, 0x2f, 0x3e, + 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, + 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x74, 0x6f, + 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, + 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, + 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, + 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x2e, 0x74, + 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d, 0x20, 0x74, + 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x73, 0x65, + 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, + 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, + 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, + 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, + 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, + 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x63, + 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, + 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, + 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 
0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, 0x61, 0x69, 0x6e, + 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, + 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, + 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, 0x75, + 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, + 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, + 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, + 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x3c, + 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, + 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, + 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e, 0x6f, 0x76, + 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x22, 0x3e, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x61, + 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, + 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, + 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67, 0x6d, 0x6c, + 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, 0x2f, 0x70, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, + 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, 0x2c, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x62, 0x6f, 0x64, 0x79, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, + 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x3e, 0x0a, 0x0a, + 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, + 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a +}; +unsigned int index_html_len = 10752; diff --git a/examples/server/index.js.hpp b/examples/server/index.js.hpp new file mode 100644 index 000000000..a3b5be6d8 --- /dev/null +++ b/examples/server/index.js.hpp @@ -0,0 +1,1851 @@ +unsigned char index_js[] = { + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x28, 0x29, + 0x7b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, + 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x79, 0x63, 0x6c, 0x65, 0x20, + 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x65, 0x64, 0x22, 0x29, 0x7d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6e, 0x28, 0x29, 0x7b, + 0x69, 0x66, 0x28, 0x6f, 0x3e, 0x31, 0x29, 0x7b, 0x6f, 0x2d, 0x2d, 0x3b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x74, + 0x2c, 0x6e, 0x3d, 0x21, 0x31, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, + 0x76, 0x6f, 
0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x5f, 0x29, 0x7b, + 0x6c, 0x65, 0x74, 0x20, 0x69, 0x3d, 0x5f, 0x3b, 0x5f, 0x3d, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x3b, 0x72, 0x2b, 0x2b, 0x3b, 0x77, 0x68, 0x69, + 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, + 0x69, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, + 0x2e, 0x6f, 0x3b, 0x69, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3b, 0x69, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69, 0x66, + 0x28, 0x21, 0x28, 0x38, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x26, 0x26, 0x63, + 0x28, 0x69, 0x29, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x2e, 0x63, 0x28, + 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, 0x7b, 0x69, + 0x66, 0x28, 0x21, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x65, 0x3b, 0x6e, 0x3d, + 0x21, 0x30, 0x7d, 0x7d, 0x69, 0x3d, 0x5f, 0x7d, 0x7d, 0x72, 0x3d, 0x30, + 0x3b, 0x6f, 0x2d, 0x2d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x74, 0x68, + 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x65, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x6f, + 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, + 0x29, 0x3b, 0x6f, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e, + 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x6e, 0x28, 0x29, 0x7d, 0x7d, 0x6c, 0x65, + 0x74, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x3d, 0x30, 0x2c, 0x72, 0x3d, + 0x30, 0x2c, 0x75, 0x3d, 0x30, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x6c, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, + 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x6e, 0x2e, 0x74, 0x21, 0x3d, 0x3d, + 0x69, 0x29, 0x7b, 0x6e, 0x3d, 0x7b, 0x69, 0x3a, 0x30, 0x2c, 0x53, 0x3a, + 0x74, 0x2c, 0x70, 0x3a, 0x69, 0x2e, 0x73, 0x2c, 0x6e, 0x3a, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x3a, 0x69, 0x2c, 0x65, 0x3a, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x78, 0x3a, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x2c, 0x72, 0x3a, 0x6e, 0x7d, 0x3b, 0x69, 0x66, 0x28, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x2e, 0x73, 0x29, + 0x69, 0x2e, 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, + 0x6e, 0x3b, 0x74, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x33, + 0x32, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x74, 0x2e, 0x53, 0x28, 0x6e, 0x29, + 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x65, 0x6c, + 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x6e, + 0x2e, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x66, + 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, + 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x6e, 0x2e, 0x70, + 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, + 0x3d, 0x6e, 0x2e, 0x70, 0x29, 0x6e, 0x2e, 0x70, 0x2e, 0x6e, 0x3d, 0x6e, + 0x2e, 0x6e, 0x3b, 0x6e, 0x2e, 0x70, 0x3d, 0x69, 0x2e, 0x73, 0x3b, 0x6e, + 0x2e, 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e, + 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x7d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x28, 0x74, 0x29, 0x7b, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 
0x69, 0x3d, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6e, + 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x66, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x68, 0x3d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x66, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, + 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x74, + 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, + 0x2e, 0x65, 0x29, 0x7b, 0x74, 0x2e, 0x78, 0x3d, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x74, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x74, 0x2e, 0x65, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x74, 0x3d, 0x74, 0x7d, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, + 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6e, 0x3d, 0x74, 0x2e, 0x65, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x78, 0x3b, + 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, + 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x78, 0x3d, 0x65, 0x3b, 0x74, 0x2e, 0x65, + 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x65, + 0x2e, 0x65, 0x3d, 0x6e, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x74, 0x3d, 0x3d, 0x3d, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, + 0x3d, 0x65, 0x7d, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, + 0x62, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, + 0x68, 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x62, + 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, + 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x69, 0x3d, 0x33, 0x32, 0x26, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x66, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, + 0x3d, 0x2d, 0x33, 0x33, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x28, 0x65, + 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x69, 0x7d, 0x7d, 0x29, 0x29, 0x7d, + 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x66, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, + 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2b, 0x22, 0x22, 0x7d, 0x3b, 0x66, 0x2e, 0x70, + 0x72, 0x6f, 
+    [... hex-encoded byte array continues; raw bytes omitted ...]

(The remainder of this hunk is machine-generated: a hex-encoded byte array
embedding the server example's minified web-UI JavaScript, recognizably
Preact plus @preact/signals from decoded strings such as "Computed cannot
have side-effects" and "Out-of-order effect". It contains nothing
hand-written to review, so the bytes are elided for readability.)
0x29, 0x7d, 0x2c, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, + 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x2c, 0x69, 0x3b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, + 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, + 0x74, 0x7c, 0x7c, 0x28, 0x65, 0x3d, 0x5b, 0x5d, 0x2c, 0x28, 0x69, 0x3d, + 0x7b, 0x7d, 0x29, 0x5b, 0x6e, 0x5d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2c, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, + 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x69, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, + 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, + 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x26, + 0x26, 0x65, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, + 0x5f, 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x6a, 0x28, 0x74, 0x29, 0x7d, 0x29, + 0x29, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x75, 0x62, 0x3d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, + 0x65, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, + 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, + 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, + 0x6e, 0x74, 0x3b, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, + 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, + 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, + 0x7b, 0x65, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x65, 0x2e, + 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x74, 0x29, 0x2c, 0x31, + 0x29, 0x2c, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, + 0x74, 0x29, 0x7d, 0x7d, 0x29, 0x2c, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, + 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x65, 0x2e, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72, + 0x2e, 0x5f, 0x5f, 0x3d, 0x65, 0x2e, 0x43, 0x6f, 0x6e, 0x73, 0x75, 0x6d, + 0x65, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x54, 0x79, + 0x70, 0x65, 0x3d, 0x65, 0x7d, 0x6b, 0x3d, 0x44, 0x2e, 0x73, 0x6c, 0x69, + 0x63, 0x65, 0x2c, 0x53, 0x3d, 0x7b, 0x5f, 0x5f, 0x65, 0x3a, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, + 0x2c, 0x69, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, + 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, + 0x3b, 0x29, 0x69, 0x66, 0x28, 0x28, 0x5f, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, + 0x63, 0x29, 0x26, 0x26, 0x21, 0x5f, 0x2e, 0x5f, 0x5f, 0x29, 0x74, 0x72, + 0x79, 0x7b, 0x69, 0x66, 0x28, 0x28, 0x6f, 0x3d, 0x5f, 0x2e, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x29, 0x26, 0x26, + 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, + 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, + 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x26, 0x26, 0x28, 0x5f, + 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x6f, 0x2e, + 0x67, 0x65, 
0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, + 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, + 0x28, 0x74, 0x29, 0x29, 0x2c, 0x72, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, + 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x5f, 0x2e, 0x63, 0x6f, + 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, + 0x74, 0x63, 0x68, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, + 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, + 0x68, 0x28, 0x74, 0x2c, 0x69, 0x7c, 0x7c, 0x7b, 0x7d, 0x29, 0x2c, 0x72, + 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, 0x72, 0x29, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x5f, + 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x3d, + 0x6e, 0x7d, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x7d, 0x2c, + 0x78, 0x3d, 0x30, 0x2c, 0x77, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x7d, 0x2c, 0x4c, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x65, + 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, + 0x20, 0x65, 0x3b, 0x65, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3f, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x5f, 0x73, 0x3a, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, + 0x3d, 0x56, 0x28, 0x7b, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x29, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, + 0x20, 0x74, 0x26, 0x26, 0x28, 0x74, 0x3d, 0x74, 0x28, 0x56, 0x28, 0x7b, + 0x7d, 0x2c, 0x65, 0x29, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x29, 0x29, 0x2c, 0x74, 0x26, 0x26, 0x56, 0x28, 0x65, + 0x2c, 0x74, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, 0x28, + 0x6e, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x73, 0x62, 0x2e, + 0x70, 0x75, 0x73, 0x68, 0x28, 0x6e, 0x29, 0x2c, 0x6a, 0x28, 0x74, 0x68, + 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x4c, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x55, + 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, + 0x5f, 0x76, 0x26, 0x26, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, + 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, + 0x2c, 0x6a, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x4c, + 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, + 0x65, 0x6e, 0x64, 0x65, 0x72, 0x3d, 0x4f, 0x2c, 0x43, 0x3d, 0x5b, 0x5d, + 0x2c, 0x55, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x50, 0x72, + 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x3f, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, + 0x65, 0x2e, 
0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, + 0x74, 0x68, 0x65, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x50, 0x72, + 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, + 0x65, 0x28, 0x29, 0x29, 0x3a, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, + 0x6f, 0x75, 0x74, 0x2c, 0x48, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, + 0x2d, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x7d, 0x2c, + 0x71, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x30, 0x2c, 0x4e, 0x3d, 0x30, 0x3b, + 0x76, 0x61, 0x72, 0x20, 0x63, 0x74, 0x2c, 0x68, 0x74, 0x2c, 0x61, 0x74, + 0x2c, 0x70, 0x74, 0x2c, 0x64, 0x74, 0x3d, 0x30, 0x2c, 0x76, 0x74, 0x3d, + 0x5b, 0x5d, 0x2c, 0x79, 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x6d, 0x74, 0x3d, + 0x53, 0x2e, 0x5f, 0x5f, 0x62, 0x2c, 0x67, 0x74, 0x3d, 0x53, 0x2e, 0x5f, + 0x5f, 0x72, 0x2c, 0x62, 0x74, 0x3d, 0x53, 0x2e, 0x64, 0x69, 0x66, 0x66, + 0x65, 0x64, 0x2c, 0x6b, 0x74, 0x3d, 0x53, 0x2e, 0x5f, 0x5f, 0x63, 0x2c, + 0x53, 0x74, 0x3d, 0x53, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, + 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x78, 0x74, + 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x2e, 0x5f, 0x5f, 0x68, 0x26, + 0x26, 0x53, 0x2e, 0x5f, 0x5f, 0x68, 0x28, 0x68, 0x74, 0x2c, 0x74, 0x2c, + 0x64, 0x74, 0x7c, 0x7c, 0x6e, 0x29, 0x2c, 0x64, 0x74, 0x3d, 0x30, 0x3b, + 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, + 0x7c, 0x7c, 0x28, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x7b, 0x5f, + 0x5f, 0x3a, 0x5b, 0x5d, 0x2c, 0x5f, 0x5f, 0x68, 0x3a, 0x5b, 0x5d, 0x7d, + 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x3e, 0x3d, + 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, + 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x7b, + 0x5f, 0x5f, 0x56, 0x3a, 0x79, 0x74, 0x7d, 0x29, 0x2c, 0x65, 0x2e, 0x5f, + 0x5f, 0x5b, 0x74, 0x5d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x77, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x31, 0x2c, 0x43, 0x74, 0x28, 0x49, + 0x74, 0x2c, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x43, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, + 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, + 0x2b, 0x2c, 0x32, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x69, 0x2e, 0x74, 0x3d, + 0x74, 0x2c, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x28, 0x69, + 0x2e, 0x5f, 0x5f, 0x3d, 0x5b, 0x65, 0x3f, 0x65, 0x28, 0x6e, 0x29, 0x3a, + 0x49, 0x74, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x29, + 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, + 0x3f, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x5b, 0x30, 0x5d, 0x3a, 0x69, 0x2e, + 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x2c, 0x65, 0x3d, 0x69, 0x2e, 0x74, 0x28, + 0x6e, 0x2c, 0x74, 0x29, 0x3b, 0x6e, 0x21, 0x3d, 0x3d, 0x65, 0x26, 0x26, + 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x5b, 0x65, 0x2c, 0x69, 0x2e, + 0x5f, 0x5f, 0x5b, 0x31, 0x5d, 0x5d, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x63, + 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, + 0x29, 0x29, 0x7d, 0x5d, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x68, + 0x74, 0x2c, 0x21, 0x68, 0x74, 0x2e, 0x75, 0x29, 0x29, 0x7b, 0x76, 0x61, + 0x72, 0x20, 0x5f, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 
0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, + 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x29, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x5f, + 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, + 0x5f, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x7d, 0x29, + 0x29, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x2e, 0x65, 0x76, 0x65, 0x72, 0x79, + 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, + 0x5f, 0x4e, 0x7d, 0x29, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, + 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x76, + 0x61, 0x72, 0x20, 0x72, 0x3d, 0x21, 0x31, 0x3b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, + 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x7b, + 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, + 0x5d, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, + 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x2c, 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, + 0x5d, 0x26, 0x26, 0x28, 0x72, 0x3d, 0x21, 0x30, 0x29, 0x7d, 0x7d, 0x29, + 0x29, 0x2c, 0x21, 0x28, 0x21, 0x72, 0x26, 0x26, 0x69, 0x2e, 0x5f, 0x5f, + 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x3d, 0x3d, 0x74, 0x29, + 0x26, 0x26, 0x28, 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, + 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, + 0x29, 0x29, 0x7d, 0x3b, 0x68, 0x74, 0x2e, 0x75, 0x3d, 0x21, 0x30, 0x3b, + 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x68, 0x74, 0x2e, 0x73, 0x68, 0x6f, + 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, + 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x2c, 0x72, 0x3d, 0x68, 0x74, 0x2e, + 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, + 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3b, 0x68, 0x74, 0x2e, 0x63, + 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, + 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, + 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x7b, + 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x6f, 0x3b, 0x6f, 0x3d, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, + 0x29, 0x2c, 0x6f, 0x3d, 0x69, 0x7d, 0x72, 0x26, 0x26, 0x72, 0x2e, 0x63, + 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7d, 0x2c, 0x68, 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, + 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, + 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x5f, 0x7d, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x7c, 0x7c, 0x69, 0x2e, + 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x45, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x33, 0x29, + 0x3b, 0x21, 0x53, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x52, 0x74, 0x28, + 0x65, 0x2e, 
0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, 0x65, + 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, + 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, + 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x55, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, + 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, + 0x2b, 0x2b, 0x2c, 0x34, 0x29, 0x3b, 0x21, 0x53, 0x2e, 0x5f, 0x5f, 0x73, + 0x26, 0x26, 0x52, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, + 0x29, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, + 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, + 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x48, 0x74, 0x28, 0x74, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x35, 0x2c, + 0x50, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x74, 0x7d, 0x7d, 0x29, 0x2c, 0x5b, + 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x4e, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x64, 0x74, + 0x3d, 0x36, 0x2c, 0x55, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, + 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x3f, 0x28, 0x74, 0x28, + 0x6e, 0x28, 0x29, 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x29, 0x3a, 0x74, 0x3f, 0x28, + 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x28, + 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, + 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x29, + 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x29, 0x2c, 0x6e, 0x75, + 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x65, 0x3a, 0x65, 0x2e, 0x63, 0x6f, + 0x6e, 0x63, 0x61, 0x74, 0x28, 0x74, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x50, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, + 0x74, 0x2b, 0x2b, 0x2c, 0x37, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x52, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, + 0x29, 0x3f, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x74, 0x28, 0x29, + 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x68, + 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x3a, 0x65, 0x2e, + 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x44, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x38, 0x2c, 0x50, 0x74, 0x28, 0x28, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, 0x29, 0x2c, 0x6e, 0x29, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x24, 0x74, + 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, + 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5b, 0x74, 0x2e, 0x5f, + 0x5f, 0x63, 0x5d, 0x2c, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, + 0x2b, 0x2c, 
0x39, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x65, 0x2e, 0x63, 0x3d, 0x74, 0x2c, 0x6e, 0x3f, 0x28, 0x6e, 0x75, 0x6c, + 0x6c, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x28, 0x65, 0x2e, + 0x5f, 0x5f, 0x3d, 0x21, 0x30, 0x2c, 0x6e, 0x2e, 0x73, 0x75, 0x62, 0x28, + 0x68, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3a, 0x74, 0x2e, 0x5f, 0x5f, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x54, 0x74, + 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x2e, 0x75, 0x73, 0x65, 0x44, + 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26, 0x53, + 0x2e, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, + 0x75, 0x65, 0x28, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x56, 0x74, + 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x78, 0x74, + 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x30, 0x29, 0x2c, 0x65, 0x3d, + 0x77, 0x74, 0x28, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x6e, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x68, 0x74, 0x2e, 0x63, 0x6f, + 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, + 0x74, 0x63, 0x68, 0x7c, 0x7c, 0x28, 0x68, 0x74, 0x2e, 0x63, 0x6f, 0x6d, + 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, + 0x63, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x74, 0x2c, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x6e, + 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x69, 0x29, 0x2c, 0x65, 0x5b, 0x31, + 0x5d, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x2c, 0x5b, 0x65, 0x5b, 0x30, 0x5d, + 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, + 0x65, 0x5b, 0x31, 0x5d, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, + 0x7d, 0x5d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x41, 0x74, 0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3d, 0x78, + 0x74, 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x31, 0x29, 0x3b, 0x69, + 0x66, 0x28, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, + 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, 0x2e, 0x5f, 0x5f, + 0x76, 0x3b, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, + 0x21, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, + 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, 0x29, 0x6e, 0x3d, 0x6e, + 0x2e, 0x5f, 0x5f, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x6e, 0x2e, + 0x5f, 0x5f, 0x6d, 0x7c, 0x7c, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x3d, + 0x5b, 0x30, 0x2c, 0x30, 0x5d, 0x29, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, + 0x22, 0x50, 0x22, 0x2b, 0x65, 0x5b, 0x30, 0x5d, 0x2b, 0x22, 0x2d, 0x22, + 0x2b, 0x65, 0x5b, 0x31, 0x5d, 0x2b, 0x2b, 0x7d, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x46, 0x74, 0x28, 0x29, 0x7b, 0x66, 0x6f, + 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, 0x74, 0x3d, 0x76, 0x74, + 0x2e, 0x73, 0x68, 0x69, 0x66, 0x74, 0x28, 0x29, 0x3b, 0x29, 0x69, 0x66, + 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x50, 0x26, 0x26, 0x74, 0x2e, 0x5f, 0x5f, + 0x48, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, + 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, + 0x4f, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, + 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x4c, 0x74, + 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, + 0x5b, 0x5d, 
0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x75, 0x29, 0x7b, + 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, + 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x75, 0x2c, 0x74, 0x2e, 0x5f, + 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x53, 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x68, + 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6d, 0x74, 0x26, 0x26, 0x6d, + 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, + 0x67, 0x74, 0x26, 0x26, 0x67, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x63, 0x74, + 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x28, 0x68, 0x74, + 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x48, 0x3b, + 0x6e, 0x26, 0x26, 0x28, 0x61, 0x74, 0x3d, 0x3d, 0x3d, 0x68, 0x74, 0x3f, + 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x68, 0x74, + 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, + 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, + 0x5f, 0x5f, 0x4e, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, + 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, + 0x79, 0x74, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x74, 0x2e, 0x69, + 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x29, 0x29, 0x29, 0x3a, + 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, + 0x63, 0x68, 0x28, 0x4f, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, + 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x4c, 0x74, 0x29, + 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x63, 0x74, + 0x3d, 0x30, 0x29, 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x68, 0x74, 0x7d, 0x2c, + 0x53, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x62, 0x74, 0x26, + 0x26, 0x62, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, + 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x6e, 0x26, 0x26, 0x6e, 0x2e, + 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, + 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, + 0x28, 0x31, 0x21, 0x3d, 0x3d, 0x76, 0x74, 0x2e, 0x70, 0x75, 0x73, 0x68, + 0x28, 0x6e, 0x29, 0x26, 0x26, 0x70, 0x74, 0x3d, 0x3d, 0x3d, 0x53, 0x2e, + 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x7c, 0x7c, 0x28, + 0x28, 0x70, 0x74, 0x3d, 0x53, 0x2e, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, + 0x61, 0x6d, 0x65, 0x29, 0x7c, 0x7c, 0x57, 0x74, 0x29, 0x28, 0x46, 0x74, + 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, + 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x69, + 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x74, 0x2e, 0x69, + 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x21, 0x3d, 0x3d, 0x79, 0x74, + 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, + 0x56, 0x29, 0x2c, 0x74, 0x2e, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x79, 0x74, 0x7d, 0x29, + 0x29, 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x68, 0x74, 0x3d, 0x6e, 0x75, 0x6c, + 0x6c, 0x7d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 
0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x6e, + 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x74, + 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, + 0x28, 0x4f, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, + 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, + 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, 0x5f, + 0x7c, 0x7c, 0x4c, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x63, + 0x61, 0x74, 0x63, 0x68, 0x28, 0x73, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, + 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x28, + 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x29, 0x7d, 0x29, 0x29, + 0x2c, 0x6e, 0x3d, 0x5b, 0x5d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, + 0x73, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x29, 0x29, + 0x2c, 0x6b, 0x74, 0x26, 0x26, 0x6b, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, + 0x7d, 0x2c, 0x53, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, + 0x53, 0x74, 0x26, 0x26, 0x53, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, + 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, + 0x65, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x65, + 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, + 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x4f, 0x74, 0x28, + 0x74, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, + 0x6e, 0x3d, 0x74, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, + 0x48, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x26, 0x26, + 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, + 0x76, 0x29, 0x29, 0x7d, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x4d, 0x74, 0x3d, + 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, + 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, + 0x72, 0x61, 0x6d, 0x65, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x57, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x6e, 0x2c, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x29, 0x7b, 0x63, 0x6c, 0x65, 0x61, 0x72, 0x54, 0x69, 0x6d, 0x65, + 0x6f, 0x75, 0x74, 0x28, 0x69, 0x29, 0x2c, 0x4d, 0x74, 0x26, 0x26, 0x63, + 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, 0x6e, 0x29, 0x2c, 0x73, + 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x29, + 0x7d, 0x2c, 0x69, 0x3d, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, + 0x75, 0x74, 0x28, 0x65, 0x2c, 0x31, 0x30, 0x30, 0x29, 0x3b, 0x4d, 0x74, + 0x26, 0x26, 0x28, 0x6e, 0x3d, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, + 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, + 0x6d, 0x65, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, + 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, + 0x5f, 0x63, 0x3b, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x22, 0x3d, 
0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x26, + 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x2c, 0x65, 0x28, 0x29, 0x29, 0x2c, 0x68, 0x74, 0x3d, 0x6e, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c, 0x74, + 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, + 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x28, + 0x29, 0x2c, 0x68, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x52, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x7c, 0x7c, 0x74, 0x2e, + 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x7c, 0x7c, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, + 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x65, 0x5d, 0x7d, 0x29, 0x29, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x74, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, + 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, + 0x3a, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x6a, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x5b, 0x74, 0x5d, + 0x3d, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x75, 0x6c, 0x6c, + 0x2c, 0x53, 0x5b, 0x74, 0x5d, 0x7c, 0x7c, 0x28, 0x28, 0x29, 0x3d, 0x3e, + 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x71, 0x74, 0x2c, + 0x42, 0x74, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x47, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x42, 0x74, 0x29, + 0x42, 0x74, 0x28, 0x29, 0x3b, 0x42, 0x74, 0x3d, 0x74, 0x26, 0x26, 0x74, + 0x2e, 0x53, 0x28, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x7a, 0x74, 0x28, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x74, + 0x7d, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x4b, + 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3d, 0x74, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x50, + 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, + 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x77, 0x68, + 0x69, 0x6c, 0x65, 0x28, 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x69, + 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x7b, 0x74, 0x2e, 0x5f, + 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, + 0x24, 0x75, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, + 0x3d, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x64, 0x28, 0x28, 0x29, 0x3d, 0x3e, + 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x30, 0x3a, + 0x21, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x22, 0x22, 0x3a, 0x74, 0x7c, + 0x7c, 0x22, 0x22, 0x7d, 0x29, 0x7d, 0x2c, 0x5b, 0x5d, 0x29, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x7d, 0x7a, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, + 0x4e, 0x61, 
0x6d, 0x65, 0x3d, 0x22, 0x5f, 0x73, 0x74, 0x22, 0x3b, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x28, 0x66, + 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x7b, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, + 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, + 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x2c, 0x74, 0x79, 0x70, 0x65, 0x3a, + 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, + 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x7a, + 0x74, 0x7d, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3a, 0x7b, 0x63, 0x6f, + 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, + 0x30, 0x2c, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x74, 0x68, 0x69, 0x73, + 0x7d, 0x7d, 0x7d, 0x2c, 0x5f, 0x5f, 0x62, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, + 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, + 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x31, 0x7d, 0x7d, 0x29, 0x3b, + 0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x62, 0x22, 0x2c, 0x28, 0x74, 0x2c, + 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, + 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, + 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x6c, 0x65, 0x74, + 0x20, 0x74, 0x2c, 0x65, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, + 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, + 0x6e, 0x20, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x63, 0x68, 0x69, + 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x63, + 0x6f, 0x6e, 0x74, 0x69, 0x6e, 0x75, 0x65, 0x3b, 0x6c, 0x65, 0x74, 0x20, + 0x5f, 0x3d, 0x65, 0x5b, 0x69, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x20, + 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, 0x66, + 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x29, 0x6e, 0x2e, 0x5f, 0x5f, + 0x6e, 0x70, 0x3d, 0x74, 0x3d, 0x7b, 0x7d, 0x3b, 0x74, 0x5b, 0x69, 0x5d, + 0x3d, 0x5f, 0x3b, 0x65, 0x5b, 0x69, 0x5d, 0x3d, 0x5f, 0x2e, 0x70, 0x65, + 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, + 0x29, 0x3b, 0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x72, 0x22, 0x2c, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x47, 0x74, 0x28, 0x29, 0x3b, + 0x6c, 0x65, 0x74, 0x20, 0x65, 0x2c, 0x69, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, + 0x63, 0x3b, 0x69, 0x66, 0x28, 0x69, 0x29, 0x7b, 0x69, 0x2e, 0x5f, 0x5f, + 0x24, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x65, 0x3d, 0x69, 0x2e, 0x5f, + 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3d, 0x3d, 0x3d, 0x65, 0x29, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x75, + 0x3d, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3b, 0x62, 0x28, 0x28, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x6e, + 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x29, 0x29, 0x3b, 0x6e, 0x2e, 0x63, + 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x66, + 0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x28, 0x29, 0x7d, 0x71, 0x74, 0x3d, 0x69, + 0x3b, 0x47, 0x74, 0x28, 0x65, 0x29, 0x3b, 0x74, 0x28, 0x6e, 0x29, 0x7d, + 0x29, 0x3b, 
0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x65, 0x22, 0x2c, 0x28, + 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x3d, 0x3e, 0x7b, 0x47, + 0x74, 0x28, 0x29, 0x3b, 0x71, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3b, 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7d, 0x29, + 0x3b, 0x6a, 0x74, 0x28, 0x22, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x22, + 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x47, 0x74, 0x28, + 0x29, 0x3b, 0x71, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, + 0x6c, 0x65, 0x74, 0x20, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, + 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x28, 0x65, + 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x29, 0x7b, 0x6c, 0x65, 0x74, + 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6e, 0x70, 0x2c, 0x69, 0x3d, + 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3b, 0x69, 0x66, 0x28, 0x74, + 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x65, 0x2e, 0x55, 0x3b, + 0x69, 0x66, 0x28, 0x6e, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, + 0x20, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x7b, 0x6c, 0x65, 0x74, + 0x20, 0x69, 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x21, + 0x28, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, 0x29, 0x7b, 0x69, 0x2e, + 0x64, 0x28, 0x29, 0x3b, 0x6e, 0x5b, 0x65, 0x5d, 0x3d, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x7d, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x6e, 0x3d, + 0x7b, 0x7d, 0x3b, 0x65, 0x2e, 0x55, 0x3d, 0x6e, 0x7d, 0x66, 0x6f, 0x72, + 0x28, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, + 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6f, 0x3d, 0x6e, 0x5b, 0x5f, 0x5d, 0x2c, + 0x72, 0x3d, 0x74, 0x5b, 0x5f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6f, 0x29, 0x7b, 0x6f, 0x3d, + 0x4a, 0x74, 0x28, 0x65, 0x2c, 0x5f, 0x2c, 0x72, 0x2c, 0x69, 0x29, 0x3b, + 0x6e, 0x5b, 0x5f, 0x5d, 0x3d, 0x6f, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, + 0x6f, 0x2e, 0x6f, 0x28, 0x72, 0x2c, 0x69, 0x29, 0x7d, 0x7d, 0x7d, 0x74, + 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x4a, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, + 0x69, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x6e, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, + 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x6f, 0x3d, + 0x73, 0x28, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, + 0x6f, 0x3a, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x6f, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, 0x69, 0x3d, 0x6e, 0x7d, + 0x2c, 0x64, 0x3a, 0x62, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6f, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x69, + 0x5b, 0x6e, 0x5d, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x69, 0x5b, 0x6e, + 0x5d, 0x3d, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x29, 0x74, 0x5b, 0x6e, + 0x5d, 0x3d, 0x65, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, + 0x65, 0x29, 0x74, 0x2e, 0x73, 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, + 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x65, 0x6c, + 0x73, 0x65, 0x20, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x41, + 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x29, 0x7d, + 0x7d, 0x29, 
0x7d, 0x7d, 0x6a, 0x74, 0x28, 0x22, 0x75, 0x6e, 0x6d, 0x6f, + 0x75, 0x6e, 0x74, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, + 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, + 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, + 0x79, 0x70, 0x65, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, + 0x2e, 0x5f, 0x5f, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x29, 0x7b, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x55, 0x3b, 0x69, + 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x55, 0x3d, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, + 0x74, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, + 0x65, 0x3d, 0x6e, 0x5b, 0x74, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x65, 0x29, + 0x65, 0x2e, 0x64, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x7d, 0x65, 0x6c, 0x73, + 0x65, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, + 0x63, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, + 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3d, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6e, 0x2e, 0x64, 0x28, 0x29, + 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x6a, 0x74, + 0x28, 0x22, 0x5f, 0x5f, 0x68, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x2c, + 0x65, 0x2c, 0x69, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x3c, + 0x33, 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x32, 0x3b, + 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7d, 0x29, 0x3b, 0x4c, + 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, + 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, + 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x65, 0x26, + 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, + 0x73, 0x7c, 0x7c, 0x34, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, + 0x24, 0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, + 0x3b, 0x69, 0x66, 0x28, 0x33, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, + 0x5f, 0x24, 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, + 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, + 0x6e, 0x20, 0x6e, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, + 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, + 0x6e, 0x20, 0x74, 0x29, 0x69, 0x66, 0x28, 0x22, 0x5f, 0x5f, 0x73, 0x6f, + 0x75, 0x72, 0x63, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x74, + 0x5b, 0x69, 0x5d, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, + 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x29, 0x69, 0x66, 0x28, 0x21, 0x28, 0x69, 0x20, + 0x69, 0x6e, 0x20, 0x74, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x21, 0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, + 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x74, + 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x50, + 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x73, 0x28, 0x74, 0x29, 0x2c, 0x5b, + 0x5d, 0x29, 
0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x51, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6e, 0x3d, 0x48, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, 0x71, 0x74, 0x2e, 0x5f, + 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x50, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x64, 0x28, 0x28, + 0x29, 0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x28, 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x48, 0x74, 0x28, 0x74, 0x29, + 0x3b, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, + 0x3b, 0x45, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x62, 0x28, 0x28, 0x29, + 0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, + 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x59, + 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x5f, 0x3b, 0x6e, 0x5b, 0x30, 0x5d, 0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, + 0x28, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x31, 0x3b, 0x6f, 0x3c, 0x6e, + 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6f, 0x2b, 0x2b, 0x29, + 0x7b, 0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, + 0x5d, 0x2c, 0x75, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3f, 0x28, 0x6e, 0x5b, + 0x30, 0x5d, 0x7c, 0x3d, 0x72, 0x3f, 0x31, 0x3a, 0x32, 0x2c, 0x65, 0x5b, + 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, 0x5d, 0x5d, 0x29, 0x3a, 0x6e, 0x5b, 0x2b, + 0x2b, 0x6f, 0x5d, 0x3b, 0x33, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, + 0x30, 0x5d, 0x3d, 0x75, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, + 0x5b, 0x31, 0x5d, 0x3d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x61, + 0x73, 0x73, 0x69, 0x67, 0x6e, 0x28, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, + 0x7b, 0x7d, 0x2c, 0x75, 0x29, 0x3a, 0x35, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, + 0x28, 0x69, 0x5b, 0x31, 0x5d, 0x3d, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, + 0x7b, 0x7d, 0x29, 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x3d, + 0x75, 0x3a, 0x36, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, 0x31, 0x5d, + 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x2b, 0x3d, 0x75, 0x2b, + 0x22, 0x22, 0x3a, 0x72, 0x3f, 0x28, 0x5f, 0x3d, 0x74, 0x2e, 0x61, 0x70, + 0x70, 0x6c, 0x79, 0x28, 0x75, 0x2c, 0x59, 0x74, 0x28, 0x74, 0x2c, 0x75, + 0x2c, 0x65, 0x2c, 0x5b, 0x22, 0x22, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x5d, + 0x29, 0x29, 0x2c, 0x69, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x5f, 0x29, + 0x2c, 0x75, 0x5b, 0x30, 0x5d, 0x3f, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, + 0x32, 0x3a, 0x28, 0x6e, 0x5b, 0x6f, 0x2d, 0x32, 0x5d, 0x3d, 0x30, 0x2c, + 0x6e, 0x5b, 0x6f, 0x5d, 0x3d, 0x5f, 0x29, 0x29, 0x3a, 0x69, 0x2e, 0x70, + 0x75, 0x73, 0x68, 0x28, 0x75, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x69, 0x7d, 0x2c, 0x5a, 0x74, 0x3d, 0x6e, 0x65, 0x77, 0x20, + 0x4d, 0x61, 0x70, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x74, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, + 0x3d, 0x5a, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7c, 0x7c, + 0x28, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x2c, 0x5a, + 0x74, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, + 0x29, 0x29, 0x2c, 0x28, 0x6e, 0x3d, 0x59, 0x74, 0x28, 0x74, 0x68, 0x69, + 0x73, 0x2c, 
0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x29, 0x7c, 0x7c, + 0x28, 0x6e, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x66, + 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x2c, 0x69, + 0x3d, 0x31, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x2c, 0x6f, 0x3d, 0x22, 0x22, + 0x2c, 0x72, 0x3d, 0x5b, 0x30, 0x5d, 0x2c, 0x75, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x31, 0x3d, 0x3d, + 0x3d, 0x69, 0x26, 0x26, 0x28, 0x74, 0x7c, 0x7c, 0x28, 0x5f, 0x3d, 0x5f, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, + 0x73, 0x2a, 0x5c, 0x6e, 0x5c, 0x73, 0x2a, 0x7c, 0x5c, 0x73, 0x2a, 0x5c, + 0x6e, 0x5c, 0x73, 0x2a, 0x24, 0x2f, 0x67, 0x2c, 0x22, 0x22, 0x29, 0x29, + 0x29, 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x30, 0x2c, 0x74, + 0x2c, 0x5f, 0x29, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, + 0x74, 0x7c, 0x7c, 0x5f, 0x29, 0x3f, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, + 0x68, 0x28, 0x33, 0x2c, 0x74, 0x2c, 0x5f, 0x29, 0x2c, 0x69, 0x3d, 0x32, + 0x29, 0x3a, 0x32, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x22, 0x2e, 0x2e, + 0x2e, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x74, 0x3f, 0x72, 0x2e, + 0x70, 0x75, 0x73, 0x68, 0x28, 0x34, 0x2c, 0x74, 0x2c, 0x30, 0x29, 0x3a, + 0x32, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x5f, 0x26, 0x26, 0x21, 0x74, + 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x35, 0x2c, 0x30, 0x2c, + 0x21, 0x30, 0x2c, 0x5f, 0x29, 0x3a, 0x69, 0x3e, 0x3d, 0x35, 0x26, 0x26, + 0x28, 0x28, 0x5f, 0x7c, 0x7c, 0x21, 0x74, 0x26, 0x26, 0x35, 0x3d, 0x3d, + 0x3d, 0x69, 0x29, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, + 0x28, 0x69, 0x2c, 0x30, 0x2c, 0x5f, 0x2c, 0x65, 0x29, 0x2c, 0x69, 0x3d, + 0x36, 0x29, 0x2c, 0x74, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, + 0x68, 0x28, 0x69, 0x2c, 0x74, 0x2c, 0x30, 0x2c, 0x65, 0x29, 0x2c, 0x69, + 0x3d, 0x36, 0x29, 0x29, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x7d, 0x2c, 0x6c, + 0x3d, 0x30, 0x3b, 0x6c, 0x3c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, + 0x68, 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x7b, 0x6c, 0x26, 0x26, 0x28, 0x31, + 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x75, 0x28, 0x29, 0x2c, 0x75, 0x28, + 0x6c, 0x29, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, + 0x66, 0x3d, 0x30, 0x3b, 0x66, 0x3c, 0x74, 0x5b, 0x6c, 0x5d, 0x2e, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x66, 0x2b, 0x2b, 0x29, 0x6e, 0x3d, + 0x74, 0x5b, 0x6c, 0x5d, 0x5b, 0x66, 0x5d, 0x2c, 0x31, 0x3d, 0x3d, 0x3d, + 0x69, 0x3f, 0x22, 0x3c, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, + 0x28, 0x29, 0x2c, 0x72, 0x3d, 0x5b, 0x72, 0x5d, 0x2c, 0x69, 0x3d, 0x33, + 0x29, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x69, + 0x3f, 0x22, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x22, + 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x69, 0x3d, 0x31, 0x2c, + 0x5f, 0x3d, 0x22, 0x22, 0x29, 0x3a, 0x5f, 0x3d, 0x6e, 0x2b, 0x5f, 0x5b, + 0x30, 0x5d, 0x3a, 0x6f, 0x3f, 0x6e, 0x3d, 0x3d, 0x3d, 0x6f, 0x3f, 0x6f, + 0x3d, 0x22, 0x22, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x3a, 0x27, 0x22, 0x27, + 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x27, 0x22, 0x3d, 0x3d, 0x3d, + 0x6e, 0x3f, 0x6f, 0x3d, 0x6e, 0x3a, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, + 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x69, 0x3d, 0x31, 0x29, 0x3a, + 0x69, 0x26, 0x26, 0x28, 0x22, 0x3d, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, + 0x28, 0x69, 0x3d, 0x35, 0x2c, 0x65, 0x3d, 0x5f, 0x2c, 0x5f, 0x3d, 0x22, + 0x22, 0x29, 0x3a, 0x22, 0x2f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x26, 0x26, + 0x28, 0x69, 
0x3c, 0x35, 0x7c, 0x7c, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, + 0x74, 0x5b, 0x6c, 0x5d, 0x5b, 0x66, 0x2b, 0x31, 0x5d, 0x29, 0x3f, 0x28, + 0x75, 0x28, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, + 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2c, 0x69, 0x3d, 0x72, 0x2c, + 0x28, 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2e, 0x70, 0x75, 0x73, + 0x68, 0x28, 0x32, 0x2c, 0x30, 0x2c, 0x69, 0x29, 0x2c, 0x69, 0x3d, 0x30, + 0x29, 0x3a, 0x22, 0x20, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, + 0x5c, 0x74, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x6e, + 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x72, 0x22, 0x3d, + 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x69, 0x3d, 0x32, + 0x29, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, + 0x69, 0x26, 0x26, 0x22, 0x21, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, + 0x26, 0x26, 0x28, 0x69, 0x3d, 0x34, 0x2c, 0x72, 0x3d, 0x72, 0x5b, 0x30, + 0x5d, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x28, + 0x29, 0x2c, 0x72, 0x7d, 0x28, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x29, 0x2c, + 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x5b, 0x5d, + 0x29, 0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x31, 0x3f, + 0x6e, 0x3a, 0x6e, 0x5b, 0x30, 0x5d, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x6e, + 0x6e, 0x3d, 0x74, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x46, 0x29, + 0x3b, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x7b, 0x4c, 0x20, 0x61, 0x73, + 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x4f, + 0x20, 0x61, 0x73, 0x20, 0x46, 0x72, 0x61, 0x67, 0x6d, 0x65, 0x6e, 0x74, + 0x2c, 0x66, 0x20, 0x61, 0x73, 0x20, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, + 0x2c, 0x65, 0x20, 0x61, 0x73, 0x20, 0x62, 0x61, 0x74, 0x63, 0x68, 0x2c, + 0x66, 0x74, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6c, 0x6f, 0x6e, 0x65, 0x45, + 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x64, 0x20, 0x61, 0x73, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x73, 0x74, 0x20, + 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x2c, 0x46, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, + 0x65, 0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, + 0x57, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, + 0x65, 0x66, 0x2c, 0x62, 0x20, 0x61, 0x73, 0x20, 0x65, 0x66, 0x66, 0x65, + 0x63, 0x74, 0x2c, 0x46, 0x20, 0x61, 0x73, 0x20, 0x68, 0x2c, 0x6e, 0x6e, + 0x20, 0x61, 0x73, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x6c, 0x74, 0x20, + 0x61, 0x73, 0x20, 0x68, 0x79, 0x64, 0x72, 0x61, 0x74, 0x65, 0x2c, 0x77, + 0x20, 0x61, 0x73, 0x20, 0x69, 0x73, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x45, + 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x53, 0x20, 0x61, 0x73, 0x20, + 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2c, 0x75, 0x74, 0x20, 0x61, + 0x73, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x73, 0x20, 0x61, + 0x73, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x7a, 0x20, 0x61, + 0x73, 0x20, 0x74, 0x6f, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x41, 0x72, 0x72, + 0x61, 0x79, 0x2c, 0x44, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, + 0x43, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2c, 0x51, 0x74, 0x20, + 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x75, 0x74, + 0x65, 0x64, 0x2c, 0x24, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, + 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, 0x54, 0x74, 0x20, 0x61, + 0x73, 0x20, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, + 0x6c, 0x75, 0x65, 0x2c, 0x45, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, + 0x65, 0x45, 
0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x56, 0x74, 0x20, 0x61, + 0x73, 0x20, 0x75, 0x73, 0x65, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x42, 0x6f, + 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x2c, 0x41, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x49, 0x64, 0x2c, 0x4e, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x49, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x69, + 0x76, 0x65, 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x55, 0x74, 0x20, + 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, + 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x50, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x2c, 0x43, 0x74, 0x20, + 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x64, 0x75, 0x63, 0x65, + 0x72, 0x2c, 0x48, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, + 0x65, 0x66, 0x2c, 0x4b, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, + 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x58, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x45, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x2c, 0x77, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, + 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a +}; +unsigned int index_js_len = 22174; diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js new file mode 100644 index 000000000..a43d5a7d5 --- /dev/null +++ b/examples/server/public/completion.js @@ -0,0 +1,168 @@ +const paramDefaults = { + stream: true, + n_predict: 500, + temperature: 0.2, + stop: [""] +}; + +let generation_settings = null; + + +// Completes the prompt as a generator. Recommended for most use cases. +// +// Example: +// +// import { llama } from '/completion.js' +// +// const request = llama("Tell me a joke", {n_predict: 800}) +// for await (const chunk of request) { +// document.write(chunk.data.content) +// } +// +export async function* llama(prompt, params = {}, config = {}) { + let controller = config.controller; + + if (!controller) { + controller = new AbortController(); + } + + const completionParams = { ...paramDefaults, ...params, prompt }; + + const response = await fetch("/completion", { + method: 'POST', + body: JSON.stringify(completionParams), + headers: { + 'Connection': 'keep-alive', + 'Content-Type': 'application/json', + 'Accept': 'text/event-stream' + }, + signal: controller.signal, + }); + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + + let content = ""; + + try { + let cont = true; + + while (cont) { + const result = await reader.read(); + if (result.done) { + break; + } + + // sse answers in the form multiple lines of: value\n with data always present as a key. 
in our case we + // mainly care about the data: key here, which we expect as json + const text = decoder.decode(result.value); + + // parse all sse events and add them to result + const regex = /^(\S+):\s(.*)$/gm; + for (const match of text.matchAll(regex)) { + result[match[1]] = match[2] + } + + // since we know this is llama.cpp, let's just decode the json in data + result.data = JSON.parse(result.data); + content += result.data.content; + + // yield + yield result; + + // if we got a stop token from server, we will break here + if (result.data.stop) { + if (result.data.generation_settings) { + generation_settings = result.data.generation_settings; + } + break; + } + } + } catch (e) { + if (e.name !== 'AbortError') { + console.error("llama error: ", e); + } + throw e; + } + finally { + controller.abort(); + } + + return content; +} + +// Call llama, return an event target that you can subcribe to +// +// Example: +// +// import { llamaEventTarget } from '/completion.js' +// +// const conn = llamaEventTarget(prompt) +// conn.addEventListener("message", (chunk) => { +// document.write(chunk.detail.content) +// }) +// +export const llamaEventTarget = (prompt, params = {}, config = {}) => { + const eventTarget = new EventTarget(); + (async () => { + let content = ""; + for await (const chunk of llama(prompt, params, config)) { + if (chunk.data) { + content += chunk.data.content; + eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data })); + } + if (chunk.data.generation_settings) { + eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings })); + } + if (chunk.data.timings) { + eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings })); + } + } + eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } })); + })(); + return eventTarget; +} + +// Call llama, return a promise that resolves to the completed text. This does not support streaming +// +// Example: +// +// llamaPromise(prompt).then((content) => { +// document.write(content) +// }) +// +// or +// +// const content = await llamaPromise(prompt) +// document.write(content) +// +export const llamaPromise = (prompt, params = {}, config = {}) => { + return new Promise(async (resolve, reject) => { + let content = ""; + try { + for await (const chunk of llama(prompt, params, config)) { + content += chunk.data.content; + } + resolve(content); + } catch (error) { + reject(error); + } + }); +}; + +/** + * (deprecated) + */ +export const llamaComplete = async (params, controller, callback) => { + for await (const chunk of llama(params.prompt, params, { controller })) { + callback(chunk); + } +} + +// Get the model info from the server. This is useful for getting the context window and so on. 
+export const llamaModelInfo = async () => { + if (!generation_settings) { + generation_settings = await fetch("/model.json").then(r => r.json()); + } + return generation_settings; +} diff --git a/examples/server/public/index.html b/examples/server/public/index.html new file mode 100644 index 000000000..8ace0b0af --- /dev/null +++ b/examples/server/public/index.html @@ -0,0 +1,380 @@ + + + + + + llama.cpp - chat + + + + + + + + + + diff --git a/examples/server/public/index.js b/examples/server/public/index.js new file mode 100644 index 000000000..4fa725a90 --- /dev/null +++ b/examples/server/public/index.js @@ -0,0 +1 @@ +function t(){throw new Error("Cycle detected")}function n(){if(o>1){o--;return}let t,n=!1;while(void 0!==_){let i=_;_=void 0;r++;while(void 0!==i){const _=i.o;i.o=void 0;i.f&=-3;if(!(8&i.f)&&c(i))try{i.c()}catch(e){if(!n){t=e;n=!0}}i=_}}r=0;o--;if(n)throw t}function e(t){if(o>0)return t();o++;try{return t()}finally{n()}}let i,_,o=0,r=0,u=0;function l(t){if(void 0===i)return;let n=t.n;if(void 0===n||n.t!==i){n={i:0,S:t,p:i.s,n:void 0,t:i,e:void 0,x:void 0,r:n};if(void 0!==i.s)i.s.n=n;i.s=n;t.n=n;if(32&i.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=i.s;n.n=void 0;i.s.n=n;i.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){const n=this;return b((function(){const e=n.value,i=32&this.f;this.f&=-33;try{t(e)}finally{this.f|=i}}))};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){return this.v};Object.defineProperty(f.prototype,"value",{get(){const t=l(this);if(void 0!==t)t.i=this.i;return this.v},set(e){if(i instanceof p)!function(){throw new Error("Computed cannot have side-effects")}();if(e!==this.v){if(r>100)t();this.v=e;this.i++;u++;o++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function s(t){return new f(t)}function c(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function h(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function a(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function p(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=u-1;this.f=4}(p.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===u)return!0;this.g=u;this.f|=1;if(this.i>0&&!c(this)){this.f&=-2;return!0}const t=i;try{h(this);i=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}i=t;a(this);this.f&=-2;return!0};p.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};p.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};p.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 
0!==t;t=t.x)t.t.N()}};p.prototype.peek=function(){if(!this.h())t();if(16&this.f)throw this.v;return this.v};Object.defineProperty(p.prototype,"value",{get(){if(1&this.f)t();const n=l(this);this.h();if(void 0!==n)n.i=this.i;if(16&this.f)throw this.v;return this.v}});function d(t){return new p(t)}function v(t){const e=t.u;t.u=void 0;if("function"==typeof e){o++;const _=i;i=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;y(t);throw n}finally{i=_;n()}}}function y(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;v(t)}function m(t){if(i!==this)throw new Error("Out-of-order effect");a(this);i=t;this.f&=-2;if(8&this.f)y(this);n()}function g(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}g.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};g.prototype.S=function(){if(1&this.f)t();this.f|=1;this.f&=-9;v(this);h(this);o++;const n=i;i=this;return m.bind(this,n)};g.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=_;_=this}};g.prototype.d=function(){this.f|=8;if(!(1&this.f))y(this)};function b(t){const n=new g(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var k,S,x,w,C,E,U,H,N,P={},D=[],$=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,T=Array.isArray;function V(t,n){for(var e in n)t[e]=n[e];return t}function A(t){var n=t.parentNode;n&&n.removeChild(t)}function F(t,n,e){var i,_,o,r={};for(o in n)"key"==o?i=n[o]:"ref"==o?_=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?k.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return M(t,r,i,_,null)}function M(t,n,e,i,_){var o={type:t,props:n,key:e,ref:i,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,__h:null,constructor:void 0,__v:null==_?++x:_};return null==_&&null!=S.vnode&&S.vnode(o),o}function W(){return{current:null}}function O(t){return t.children}function L(t,n){this.props=t,this.context=n}function R(t,n){if(null==n)return t.__?R(t.__,t.__.__k.indexOf(t)+1):null;for(var e;nn&&C.sort(H));q.__r=0}function B(t,n,e,i,_,o,r,u,l,f){var s,c,h,a,p,d,v,y=i&&i.__k||D,m=y.length;for(e.__k=[],s=0;s0?M(a.type,a.props,a.key,a.ref?a.ref:null,a.__v):a)){if(a.__=e,a.__b=e.__b+1,null===(h=y[s])||h&&a.key==h.key&&a.type===h.type)y[s]=void 0;else for(c=0;c=0;n--)if((e=t.__k[n])&&(i=K(e)))return i;return null}function Q(t,n,e,i,_){var o;for(o in e)"children"===o||"key"===o||o in n||Y(t,o,null,e[o],i);for(o in n)_&&"function"!=typeof n[o]||"children"===o||"key"===o||"value"===o||"checked"===o||e[o]===n[o]||Y(t,o,n[o],e[o],i)}function X(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||$.test(n)?e:e+"px"}function Y(t,n,e,i,_){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof i&&(t.style.cssText=i=""),i)for(n in i)e&&n in e||X(t.style,n,"");if(e)for(n in e)i&&e[n]===i[n]||X(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/Capture$/,"")),n=n.toLowerCase()in t?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?i||t.addEventListener(n,o?tt:Z,o):t.removeEventListener(n,o?tt:Z,o);else if("dangerouslySetInnerHTML"!==n){if(_)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!==n&&"height"!==n&&"href"!==n&&"list"!==n&&"form"!==n&&"tabIndex"!==n&&"download"!==n&&"rowSpan"!==n&&"colSpan"!==n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof 
e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,e))}}function Z(t){return this.l[t.type+!1](S.event?S.event(t):t)}function tt(t){return this.l[t.type+!0](S.event?S.event(t):t)}function nt(t,n,e,i,_,o,r,u,l){var f,s,c,h,a,p,d,v,y,m,g,b,k,x,w,C=n.type;if(void 0!==n.constructor)return null;null!=e.__h&&(l=e.__h,u=n.__e=e.__e,n.__h=null,o=[u]),(f=S.__b)&&f(n);try{t:if("function"==typeof C){if(v=n.props,y=(f=C.contextType)&&i[f.__c],m=f?y?y.props.value:f.__:i,e.__c?d=(s=n.__c=e.__c).__=s.__E:("prototype"in C&&C.prototype.render?n.__c=s=new C(v,m):(n.__c=s=new L(v,m),s.constructor=C,s.render=rt),y&&y.sub(s),s.props=v,s.state||(s.state={}),s.context=m,s.__n=i,c=s.__d=!0,s.__h=[],s._sb=[]),null==s.__s&&(s.__s=s.state),null!=C.getDerivedStateFromProps&&(s.__s==s.state&&(s.__s=V({},s.__s)),V(s.__s,C.getDerivedStateFromProps(v,s.__s))),h=s.props,a=s.state,s.__v=n,c)null==C.getDerivedStateFromProps&&null!=s.componentWillMount&&s.componentWillMount(),null!=s.componentDidMount&&s.__h.push(s.componentDidMount);else{if(null==C.getDerivedStateFromProps&&v!==h&&null!=s.componentWillReceiveProps&&s.componentWillReceiveProps(v,m),!s.__e&&null!=s.shouldComponentUpdate&&!1===s.shouldComponentUpdate(v,s.__s,m)||n.__v===e.__v){for(n.__v!==e.__v&&(s.props=v,s.state=s.__s,s.__d=!1),s.__e=!1,n.__e=e.__e,n.__k=e.__k,n.__k.forEach((function(t){t&&(t.__=n)})),g=0;g2&&(u.children=arguments.length>3?k.call(arguments,2):e),M(t.type,u,i||t.key,_||t.ref,null)}function st(t,n){var e={__c:n="__cC"+N++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,i;return this.getChildContext||(e=[],(i={})[n]=this,this.getChildContext=function(){return i},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,j(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}k=D.slice,S={__e:function(t,n,e,i){for(var _,o,r;n=n.__;)if((_=n.__c)&&!_.__)try{if((o=_.constructor)&&null!=o.getDerivedStateFromError&&(_.setState(o.getDerivedStateFromError(t)),r=_.__d),null!=_.componentDidCatch&&(_.componentDidCatch(t,i||{}),r=_.__d),r)return _.__E=_}catch(n){t=n}throw t}},x=0,w=function(t){return null!=t&&void 0===t.constructor},L.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=V({},this.state),"function"==typeof t&&(t=t(V({},e),this.props)),t&&V(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),j(this))},L.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),j(this))},L.prototype.render=O,C=[],U="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,H=function(t,n){return t.__v.__b-n.__v.__b},q.__r=0,N=0;var ct,ht,at,pt,dt=0,vt=[],yt=[],mt=S.__b,gt=S.__r,bt=S.diffed,kt=S.__c,St=S.unmount;function xt(t,n){S.__h&&S.__h(ht,t,dt||n),dt=0;var e=ht.__H||(ht.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:yt}),e.__[t]}function wt(t){return dt=1,Ct(It,t)}function Ct(t,n,e){var i=xt(ct++,2);if(i.t=t,!i.__c&&(i.__=[e?e(n):It(void 0,n),function(t){var n=i.__N?i.__N[0]:i.__[0],e=i.t(n,t);n!==e&&(i.__N=[e,i.__[1]],i.__c.setState({}))}],i.__c=ht,!ht.u)){var _=function(t,n,e){if(!i.__c.__H)return!0;var _=i.__c.__H.__.filter((function(t){return t.__c}));if(_.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return _.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 
0,n!==t.__[0]&&(r=!0)}})),!(!r&&i.__c.props===t)&&(!o||o.call(this,t,n,e))};ht.u=!0;var o=ht.shouldComponentUpdate,r=ht.componentWillUpdate;ht.componentWillUpdate=function(t,n,e){if(this.__e){var i=o;o=void 0,_(t,n,e),o=i}r&&r.call(this,t,n,e)},ht.shouldComponentUpdate=_}return i.__N||i.__}function Et(t,n){var e=xt(ct++,3);!S.__s&&Rt(e.__H,n)&&(e.__=t,e.i=n,ht.__H.__h.push(e))}function Ut(t,n){var e=xt(ct++,4);!S.__s&&Rt(e.__H,n)&&(e.__=t,e.i=n,ht.__h.push(e))}function Ht(t){return dt=5,Pt((function(){return{current:t}}),[])}function Nt(t,n,e){dt=6,Ut((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Pt(t,n){var e=xt(ct++,7);return Rt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Dt(t,n){return dt=8,Pt((function(){return t}),n)}function $t(t){var n=ht.context[t.__c],e=xt(ct++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(ht)),n.props.value):t.__}function Tt(t,n){S.useDebugValue&&S.useDebugValue(n?n(t):t)}function Vt(t){var n=xt(ct++,10),e=wt();return n.__=t,ht.componentDidCatch||(ht.componentDidCatch=function(t,i){n.__&&n.__(t,i),e[1](t)}),[e[0],function(){e[1](void 0)}]}function At(){var t=xt(ct++,11);if(!t.__){for(var n=ht.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ft(){for(var t;t=vt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Ot),t.__H.__h.forEach(Lt),t.__H.__h=[]}catch(u){t.__H.__h=[],S.__e(u,t.__v)}}S.__b=function(t){ht=null,mt&&mt(t)},S.__r=function(t){gt&>(t),ct=0;var n=(ht=t.__c).__H;n&&(at===ht?(n.__h=[],ht.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=yt,t.__N=t.i=void 0}))):(n.__h.forEach(Ot),n.__h.forEach(Lt),n.__h=[],ct=0)),at=ht},S.diffed=function(t){bt&&bt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==vt.push(n)&&pt===S.requestAnimationFrame||((pt=S.requestAnimationFrame)||Wt)(Ft)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==yt&&(t.__=t.__V),t.i=void 0,t.__V=yt}))),at=ht=null},S.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Ot),t.__h=t.__h.filter((function(t){return!t.__||Lt(t)}))}catch(s){n.some((function(t){t.__h&&(t.__h=[])})),n=[],S.__e(s,t.__v)}})),kt&&kt(t,n)},S.unmount=function(t){St&&St(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Ot(t)}catch(t){n=t}})),e.__H=void 0,n&&S.__e(n,e.__v))};var Mt="function"==typeof requestAnimationFrame;function Wt(t){var n,e=function(){clearTimeout(i),Mt&&cancelAnimationFrame(n),setTimeout(t)},i=setTimeout(e,100);Mt&&(n=requestAnimationFrame(e))}function Ot(t){var n=ht,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),ht=n}function Lt(t){var n=ht;t.__c=t.__(),ht=n}function Rt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function It(t,n){return"function"==typeof n?n(t):n}function jt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let qt,Bt;function Gt(t){if(Bt)Bt();Bt=t&&t.S()}function zt({data:t}){const n=Kt(t);n.value=t;const e=Pt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{this.base.data=e.peek()};return d(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}zt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:zt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});jt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let i in e){if("children"===i)continue;let _=e[i];if(_ instanceof 
f){if(!t)n.__np=t={};t[i]=_;e[i]=_.peek()}}}t(n)});jt("__r",(t,n)=>{Gt();let e,i=n.__c;if(i){i.__$f&=-2;e=i.__$u;if(void 0===e)i.__$u=e=function(t){let n;b((function(){n=this}));n.c=()=>{i.__$f|=1;i.setState({})};return n}()}qt=i;Gt(e);t(n)});jt("__e",(t,n,e,i)=>{Gt();qt=void 0;t(n,e,i)});jt("diffed",(t,n)=>{Gt();qt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,i=n.props;if(t){let n=e.U;if(n)for(let e in n){let i=n[e];if(void 0!==i&&!(e in t)){i.d();n[e]=void 0}}else{n={};e.U=n}for(let _ in t){let o=n[_],r=t[_];if(void 0===o){o=Jt(e,_,r,i);n[_]=o}else o.o(r,i)}}}t(n)});function Jt(t,n,e,i){const _=n in t&&void 0===t.ownerSVGElement,o=s(e);return{o:(t,n)=>{o.value=t;i=n},d:b(()=>{const e=o.value.value;if(i[n]!==e){i[n]=e;if(_)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}jt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});jt("__h",(t,n,e,i)=>{if(i<3)n.__$f|=2;t(n,e,i)});L.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let i in n)return!0;for(let i in t)if("__source"!==i&&t[i]!==this.props[i])return!0;for(let i in this.props)if(!(i in t))return!0;return!1};function Kt(t){return Pt(()=>s(t),[])}function Qt(t){const n=Ht(t);n.current=t;qt.__$f|=4;return Pt(()=>d(()=>n.current()),[])}function Xt(t){const n=Ht(t);n.current=t;Et(()=>b(()=>n.current()),[])}var Yt=function(t,n,e,i){var _;n[0]=0;for(var o=1;o=5&&((_||!t&&5===i)&&(r.push(i,0,_,e),i=6),t&&(r.push(i,t,0,e),i=6)),_=""},l=0;l"===n?(i=1,_=""):_=n+_[0]:o?n===o?o="":_+=n:'"'===n||"'"===n?o=n:">"===n?(u(),i=1):i&&("="===n?(i=5,e=_,_=""):"/"===n&&(i<5||">"===t[l][f+1])?(u(),3===i&&(r=r[0]),i=r,(r=r[0]).push(2,0,i),i=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),i=2):_+=n),3===i&&"!--"===_&&(i=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var nn=tn.bind(F);export{L as Component,O as Fragment,f as Signal,e as batch,ft as cloneElement,d as computed,st as createContext,F as createElement,W as createRef,b as effect,F as h,nn as html,lt as hydrate,w as isValidElement,S as options,ut as render,s as signal,z as toChildArray,Dt as useCallback,Qt as useComputed,$t as useContext,Tt as useDebugValue,Et as useEffect,Vt as useErrorBoundary,At as useId,Nt as useImperativeHandle,Ut as useLayoutEffect,Pt as useMemo,Ct as useReducer,Ht as useRef,Kt as useSignal,Xt as useSignalEffect,wt as useState}; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 998d55eac..f442f2b56 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,8 +2,6 @@ #include "llama.h" #include "build-info.h" -// single thread -#define CPPHTTPLIB_THREAD_POOL_COUNT 1 #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 @@ -12,6 +10,11 @@ #include "httplib.h" #include "json.hpp" +// auto generated files (update with ./deps.sh) +#include "index.html.hpp" +#include "index.js.hpp" +#include "completion.js.hpp" + #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -19,37 +22,62 @@ using namespace httplib; using json = nlohmann::json; -struct server_params { +struct server_params +{ std::string hostname = "127.0.0.1"; + std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; }; -static size_t common_part(const 
std::vector & a, const std::vector & b) { +// completion token output with probabilities +struct completion_token_output +{ + struct token_prob + { + llama_token tok; + float prob; + }; + + std::vector probs; + llama_token tok; +}; + +static size_t common_part(const std::vector &a, const std::vector &b) +{ size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } return i; } -enum stop_type { +enum stop_type +{ STOP_FULL, STOP_PARTIAL, }; -static bool ends_with(const std::string & str, const std::string & suffix) { +static bool ends_with(const std::string &str, const std::string &suffix) +{ return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -static size_t find_partial_stop_string(const std::string & stop, - const std::string & text) { - if (!text.empty() && !stop.empty()) { +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { + if (ends_with(text, current_partial)) + { return text.size() - char_index - 1; } } @@ -58,26 +86,30 @@ static size_t find_partial_stop_string(const std::string & stop, return std::string::npos; } -template -static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { +template +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ std::string ret; - for (; begin != end; ++begin) { + for (; begin != end; ++begin) + { ret += llama_token_to_str(ctx, *begin); } return ret; } -static void server_log(const char * level, const char * function, int line, - const char * message, const nlohmann::ordered_json & extra) { - nlohmann::ordered_json log { - { "timestamp", time(nullptr) }, - { "level", level }, - { "function", function }, - { "line", line }, - { "message", message }, +static void server_log(const char *level, const char *function, int line, + const char *message, const nlohmann::ordered_json &extra) +{ + nlohmann::ordered_json log{ + {"timestamp", time(nullptr)}, + {"level", level}, + {"function", function}, + {"line", line}, + {"message", message}, }; - if (!extra.empty()) { + if (!extra.empty()) + { log.merge_patch(extra); } @@ -86,28 +118,72 @@ static void server_log(const char * level, const char * function, int line, fflush(stdout); } +// format incomplete utf-8 multibyte character for output +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? 
"" : llama_token_to_str(ctx, token); + // if first bit is 1, meaning it's a partial character + if (out.size() > 0 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context *ctx, const std::vector probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json{ + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + return out; +} + static bool server_verbose = false; #if SERVER_VERBOSE != 1 -# define LOG_VERBOSE(MSG, ...) +#define LOG_VERBOSE(MSG, ...) #else -# define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (server_verbose) \ + { \ server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ } \ - } while(0) + } while (0) #endif #define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -struct llama_server_context { +struct llama_server_context +{ bool stream = false; bool has_next_token = false; std::string generated_text; + std::vector generated_token_probs; + size_t num_prompt_tokens = 0; size_t num_tokens_predicted = 0; size_t n_past = 0; size_t n_remain = 0; @@ -115,8 +191,8 @@ struct llama_server_context { std::vector embd; std::vector last_n_tokens; - llama_model * model = nullptr; - llama_context * ctx = nullptr; + llama_model *model = nullptr; + llama_context *ctx = nullptr; gpt_params params; bool truncated = false; @@ -126,22 +202,35 @@ struct llama_server_context { std::string stopping_word; int32_t multibyte_pending = 0; - ~llama_server_context() { - if (ctx) { + std::mutex mutex; + + std::unique_lock lock() + { + return std::unique_lock(mutex); + } + + ~llama_server_context() + { + if (ctx) + { llama_free(ctx); ctx = nullptr; } - if (model) { + if (model) + { llama_free_model(model); model = nullptr; } } - void rewind() { + void rewind() + { params.antiprompt.clear(); + num_prompt_tokens = 0; num_tokens_predicted = 0; generated_text = ""; generated_text.reserve(params.n_ctx); + generated_token_probs.clear(); truncated = false; stopped_eos = false; stopped_word = false; @@ -153,11 +242,13 @@ struct llama_server_context { n_past = 0; } - bool loadModel(const gpt_params & params_) { + bool loadModel(const gpt_params ¶ms_) + { params = params_; std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR("unable to load model", { { "model", params_.model } }); + if (model == nullptr) + { + LOG_ERROR("unable to load model", {{"model", params_.model}}); return false; } @@ -166,34 +257,40 @@ struct llama_server_context { return true; } - void loadPrompt() { + void loadPrompt() + { params.prompt.insert(0, 1, ' '); // always add a first space std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + num_prompt_tokens = prompt_tokens.size(); - if (params.n_keep < 0) { - params.n_keep = 
(int)prompt_tokens.size(); + if (params.n_keep < 0) + { + params.n_keep = (int)num_prompt_tokens; } params.n_keep = std::min(params.n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (prompt_tokens.size() >= (size_t)params.n_ctx) { + if (num_prompt_tokens >= (size_t)params.n_ctx) + { const int n_left = (params.n_ctx - params.n_keep) / 2; std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); LOG_VERBOSE("input truncated", { - { "n_ctx", params.n_ctx }, - { "n_keep", params.n_keep }, - { "n_left", n_left }, - { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, - }); + {"n_ctx", params.n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); truncated = true; prompt_tokens = new_tokens; - } else { - const size_t ps = prompt_tokens.size(); + } + else + { + const size_t ps = num_prompt_tokens; std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); } @@ -201,30 +298,35 @@ struct llama_server_context { // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); embd = prompt_tokens; - if (n_past == prompt_tokens.size()) { + if (n_past == num_prompt_tokens) + { // we have to evaluate at least 1 token to generate logits. 
n_past--; } LOG_VERBOSE("prompt ingested", { - { "n_past", n_past }, - { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) }, - { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, - }); + {"n_past", n_past}, + {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, + {"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, + }); has_next_token = true; } - void beginCompletion() { + void beginCompletion() + { // number of tokens to keep when resetting context n_remain = params.n_predict; llama_set_rng_seed(ctx, params.seed); } - llama_token nextToken() { - llama_token result = -1; + completion_token_output nextToken() + { + completion_token_output result; + result.tok = -1; - if (embd.size() >= (size_t)params.n_ctx) { + if (embd.size() >= (size_t)params.n_ctx) + { // Reset context const int n_left = (params.n_ctx - params.n_keep) / 2; @@ -234,34 +336,39 @@ struct llama_server_context { n_past = params.n_keep; truncated = true; LOG_VERBOSE("input truncated", { - { "n_ctx", params.n_ctx }, - { "n_keep", params.n_keep }, - { "n_left", n_left }, - { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, - }); + {"n_ctx", params.n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); } - while (n_past < embd.size()) { + while (n_past < embd.size()) + { int n_eval = (int)embd.size() - n_past; - if (n_eval > params.n_batch) { + if (n_eval > params.n_batch) + { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + { LOG_ERROR("failed to eval", { - { "n_eval", n_eval }, - { "n_past", n_past }, - { "n_threads", params.n_threads }, - { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, - }); + {"n_eval", n_eval}, + {"n_past", n_past}, + {"n_threads", params.n_threads}, + {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())}, + }); has_next_token = false; return result; } n_past += n_eval; } - if (params.n_predict == 0) { + if (params.n_predict == 0) + { has_next_token = false; - return llama_token_eos(); + result.tok = llama_token_eos(); + return result; } // out of user input, sample next token @@ -278,74 +385,95 @@ struct llama_server_context { const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; - llama_token id = 0; + const int32_t n_probs = params.n_probs; { - auto * logits = llama_get_logits(ctx); + auto *logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); // Apply params.logit_bias map - for (const auto & it : params.logit_bias) { + for (const auto &it : params.logit_bias) + { logits[it.first] += it.second; } std::vector candidates; candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) + { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; // Apply penalties float nl_logit = logits[llama_token_nl()]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), 
params.n_ctx); llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) { + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) + { logits[llama_token_nl()] = nl_logit; } - if (temp <= 0) { + if (temp <= 0) + { // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { + result.tok = llama_sample_token_greedy(ctx, &candidates_p); + if (n_probs > 0) + { + llama_sample_softmax(ctx, &candidates_p); + } + } + else + { + if (mirostat == 1) + { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { + result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } + else if (mirostat == 2) + { static float mirostat_mu = 2.0f * mirostat_tau; llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { + result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } + else + { // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); + size_t min_keep = std::max(1, n_probs); + llama_sample_top_k(ctx, &candidates_p, top_k, min_keep); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); + llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); + llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); + result.tok = llama_sample_token(ctx, &candidates_p); } } + + for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i) + { + result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p}); + } last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); + last_n_tokens.push_back(result.tok); num_tokens_predicted++; } // add it to the context - embd.push_back(id); - result = id; + embd.push_back(result.tok); // decrement remaining sampling budget --n_remain; - if (!embd.empty() && embd.back() == llama_token_eos()) { - //stopping_word = llama_token_to_str(ctx, embd.back()); + if (!embd.empty() && embd.back() == llama_token_eos()) + { + // stopping_word = llama_token_to_str(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -356,22 +484,28 @@ struct llama_server_context { return result; } - size_t findStoppingStrings(const std::string & text, const size_t last_token_size, - const stop_type type) { + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) + { size_t stop_pos = std::string::npos; - for (const std::string & word : params.antiprompt) { + 
for (const std::string &word : params.antiprompt) + { size_t pos; - if (type == STOP_FULL) { + if (type == STOP_FULL) + { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } - else { + else + { pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { stopping_word = word; stopped_word = true; has_next_token = false; @@ -382,70 +516,91 @@ struct llama_server_context { return stop_pos; } - std::string doCompletion() { - const llama_token token = nextToken(); + completion_token_output doCompletion() + { + const completion_token_output token_with_probs = nextToken(); - const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); generated_text += token_text; - if (multibyte_pending > 0) { + if (params.n_probs > 0) + { + generated_token_probs.push_back(token_with_probs); + } + + if (multibyte_pending > 0) + { multibyte_pending -= token_text.size(); - } else if (token_text.size() == 1) { + } + else if (token_text.size() == 1) + { const char c = token_text[0]; // 2-byte characters: 110xxxxx 10xxxxxx - if ((c & 0xE0) == 0xC0) { + if ((c & 0xE0) == 0xC0) + { multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF0) == 0xE0) { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } + else if ((c & 0xF0) == 0xE0) + { multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - } else if ((c & 0xF8) == 0xF0) { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } + else if ((c & 0xF8) == 0xF0) + { multibyte_pending = 3; - } else { + } + else + { multibyte_pending = 0; } } - if (multibyte_pending > 0 && !has_next_token) { + if (multibyte_pending > 0 && !has_next_token) + { has_next_token = true; n_remain++; } - if (!has_next_token && n_remain == 0) { + if (!has_next_token && n_remain == 0) + { stopped_limit = true; } LOG_VERBOSE("next token", { - { "token", token }, - { "token_text", llama_token_to_str(ctx, token) }, - { "has_next_token", has_next_token }, - { "n_remain", n_remain }, - { "num_tokens_predicted", num_tokens_predicted }, - { "stopped_eos", stopped_eos }, - { "stopped_word", stopped_word }, - { "stopped_limit", stopped_limit }, - { "stopping_word", stopping_word }, - }); + {"token", token_with_probs.tok}, + {"token_text", tokens_to_output_formatted_string(ctx, token_with_probs.tok)}, + {"has_next_token", has_next_token}, + {"n_remain", n_remain}, + {"num_tokens_predicted", num_tokens_predicted}, + {"stopped_eos", stopped_eos}, + {"stopped_word", stopped_word}, + {"stopped_limit", stopped_limit}, + {"stopping_word", stopping_word}, + }); - return token_text; + return token_with_probs; } - std::vector getEmbedding() { + std::vector getEmbedding() + { static const int n_embd = llama_n_embd(ctx); - if (!params.embedding) { + if (!params.embedding) + { LOG_WARNING("embedding disabled", { - { "params.embedding", params.embedding }, - }); + {"params.embedding", params.embedding}, + }); return std::vector(n_embd, 0.0f); } - const float * data = llama_get_embeddings(ctx); + const float *data = llama_get_embeddings(ctx); std::vector embedding(data, data + n_embd); return embedding; } }; -static void 
server_print_usage(const char * argv0, const gpt_params & params, - const server_params & sparams) { +static void server_print_usage(const char *argv0, const gpt_params ¶ms, + const server_params &sparams) +{ fprintf(stderr, "usage: %s [options]\n", argv0); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -453,13 +608,17 @@ static void server_print_usage(const char * argv0, const gpt_params & params, fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - if (llama_mlock_supported()) { + if (llama_mlock_supported()) + { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) { + if (llama_mmap_supported()) + { fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD @@ -479,77 +638,135 @@ static void server_print_usage(const char * argv0, const gpt_params & params, fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stderr, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); fprintf(stderr, "\n"); } -static void server_params_parse(int argc, char ** argv, server_params & sparams, - gpt_params & params) { +static void server_params_parse(int argc, char **argv, server_params &sparams, + gpt_params ¶ms) +{ gpt_params default_params; server_params default_sparams; std::string arg; bool invalid_param = false; - for (int i = 1; i < argc; i++) { + for (int i = 1; i < argc; i++) + { arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { + if (arg == "--port") + { + if (++i >= argc) + { invalid_param = true; break; } sparams.port = std::stoi(argv[i]); - } else if (arg == "--host") { - if (++i >= argc) { + } + else if (arg == "--host") + { + if (++i >= argc) + { invalid_param = true; break; } sparams.hostname = argv[i]; - } else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { + } + else if (arg == "--path") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.public_path = argv[i]; + } + else if (arg == "--timeout" || arg == "-to") + { + if (++i >= argc) + { invalid_param = true; break; } sparams.read_timeout = std::stoi(argv[i]); sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { + } + else if (arg == "-m" || arg == "--model") + { + if (++i >= argc) + { invalid_param = true; break; } params.model = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { + } + else if (arg == "-a" || arg == "--alias") + { + if (++i >= argc) + { invalid_param = true; break; } params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { + } + else if (arg == "-h" || arg == "--help") + { server_print_usage(argv[0], default_params, default_sparams); exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { + } + else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") + { + if (++i >= argc) + { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory-f32" || arg == "--memory_f32") { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { + } + else if (arg == "--rope-freq-base") + { if (++i >= argc) { invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { if (++i >= argc) { invalid_param = true; break; } + params.rope_freq_scale = std::stof(argv[i]); + } + else if (arg == "--memory-f32" || arg == "--memory_f32") + { + params.memory_f16 = false; + } + else if (arg == "--threads" || arg == "-t") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } + else if (arg == "-b" || arg == "--batch-size") + { + if (++i >= argc) + { + invalid_param = true; + break; + } params.n_batch = std::stoi(argv[i]); params.n_batch = std::min(512, params.n_batch); - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { + } + else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") + { + if (++i >= argc) + { invalid_param = true; break; } @@ -557,11 +774,14 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, params.n_gpu_layers = std::stoi(argv[i]); #else LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
" - "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } }); + "See main README.md for information on enabling GPU BLAS support", + {{"n_gpu_layers", params.n_gpu_layers}}); #endif } - else if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { + else if (arg == "--tensor-split" || arg == "-ts") + { + if (++i >= argc) + { invalid_param = true; break; } @@ -569,16 +789,19 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, std::string arg_next = argv[i]; // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) { - if (i_device < split_arg.size()) { + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) + { + if (i_device < split_arg.size()) + { params.tensor_split[i_device] = std::stof(split_arg[i_device]); } - else { + else + { params.tensor_split[i_device] = 0.0f; } } @@ -594,8 +817,10 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); #endif // GGML_USE_CUBLAS } - else if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { + else if (arg == "--main-gpu" || arg == "-mg") + { + if (++i >= argc) + { invalid_param = true; break; } @@ -604,110 +829,173 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, #else LOG_WARNING("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.", {}); #endif - } else if (arg == "--lora") { - if (++i >= argc) { + } + else if (arg == "--lora") + { + if (++i >= argc) + { invalid_param = true; break; } params.lora_adapter = argv[i]; params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { + } + else if (arg == "--lora-base") + { + if (++i >= argc) + { invalid_param = true; break; } params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { + } + else if (arg == "-v" || arg == "--verbose") + { #if SERVER_VERBOSE != 1 LOG_WARNING("server.cpp is not built with verbose logging.", {}); #else server_verbose = true; #endif - } else if (arg == "--mlock") { + } + else if (arg == "--mlock") + { params.use_mlock = true; - } else if (arg == "--no-mmap") { + } + else if (arg == "--no-mmap") + { params.use_mmap = false; - } else if (arg == "--embedding") { + } + else if (arg == "--embedding") + { params.embedding = true; - } else { + } + else + { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); exit(1); } } - if (invalid_param) { + if (invalid_param) + { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); exit(1); } } -static json format_generation_settings(llama_server_context & llama) { +static json format_generation_settings(llama_server_context &llama) +{ const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && - eos_bias->second < 0.0f && std::isinf(eos_bias->second); + eos_bias->second < 0.0f && std::isinf(eos_bias->second); - return json { - { "seed", llama.params.seed }, - { "temp", llama.params.temp }, - { "top_k", llama.params.top_k }, - { "top_p", llama.params.top_p }, - { "tfs_z", llama.params.tfs_z }, - { "typical_p", llama.params.typical_p }, - { "repeat_last_n", llama.params.repeat_last_n }, - { "repeat_penalty", llama.params.repeat_penalty }, - { "presence_penalty", llama.params.presence_penalty }, - { "frequency_penalty", llama.params.frequency_penalty }, - { "mirostat", llama.params.mirostat }, - { "mirostat_tau", llama.params.mirostat_tau }, - { "mirostat_eta", llama.params.mirostat_eta }, - { "penalize_nl", llama.params.penalize_nl }, - { "stop", llama.params.antiprompt }, - { "n_predict", llama.params.n_predict }, - { "n_keep", llama.params.n_keep }, - { "ignore_eos", ignore_eos }, - { "stream", llama.stream }, - { "logit_bias", llama.params.logit_bias }, + return json{ + {"n_ctx", llama.params.n_ctx}, + {"model", llama.params.model_alias}, + {"seed", llama.params.seed}, + {"temp", llama.params.temp}, + {"top_k", llama.params.top_k}, + {"top_p", llama.params.top_p}, + {"tfs_z", llama.params.tfs_z}, + {"typical_p", llama.params.typical_p}, + {"repeat_last_n", llama.params.repeat_last_n}, + {"repeat_penalty", llama.params.repeat_penalty}, + {"presence_penalty", llama.params.presence_penalty}, + {"frequency_penalty", llama.params.frequency_penalty}, + {"mirostat", llama.params.mirostat}, + {"mirostat_tau", llama.params.mirostat_tau}, + {"mirostat_eta", llama.params.mirostat_eta}, + {"penalize_nl", llama.params.penalize_nl}, + {"stop", llama.params.antiprompt}, + {"n_predict", llama.params.n_predict}, + {"n_keep", llama.params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", llama.stream}, + {"logit_bias", llama.params.logit_bias}, + {"n_probs", llama.params.n_probs}, }; } -static json 
format_embedding_response(llama_server_context & llama) { - return json { - { "embedding", llama.getEmbedding() }, +static json format_embedding_response(llama_server_context &llama) +{ + return json{ + {"embedding", llama.getEmbedding()}, }; } -static json format_final_response(llama_server_context & llama, const std::string & content) { - return json { - { "content", content }, - { "stop", true }, - { "model", llama.params.model_alias }, - { "tokens_predicted", llama.num_tokens_predicted }, - { "generation_settings", format_generation_settings(llama) }, - { "prompt", llama.params.prompt }, - { "truncated", llama.truncated }, - { "stopped_eos", llama.stopped_eos }, - { "stopped_word", llama.stopped_word }, - { "stopped_limit", llama.stopped_limit }, - { "stopping_word", llama.stopping_word }, +static json format_timings(llama_server_context &llama) +{ + const auto timings = llama_get_timings(llama.ctx); + + assert(timings.n_eval == llama.num_tokens_predicted); + + return json{ + {"prompt_n", timings.n_eval}, + {"prompt_ms", timings.t_p_eval_ms}, + {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, + {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, + + {"predicted_n", timings.n_eval}, + {"predicted_ms", timings.t_eval_ms}, + {"predicted_per_token_ms", timings.t_eval_ms / timings.n_eval}, + {"predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval}, }; } -static json format_partial_response(const std::string & content) { - return json { - { "content", content }, - { "stop", false }, +static json format_final_response(llama_server_context &llama, const std::string &content, const std::vector &probs) +{ + + json res = json{ + {"content", content}, + {"stop", true}, + {"model", llama.params.model_alias}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"tokens_evaluated", llama.num_prompt_tokens}, + {"generation_settings", format_generation_settings(llama)}, + {"prompt", llama.params.prompt}, + {"truncated", llama.truncated}, + {"stopped_eos", llama.stopped_eos}, + {"stopped_word", llama.stopped_word}, + {"stopped_limit", llama.stopped_limit}, + {"stopping_word", llama.stopping_word}, + {"tokens_cached", llama.n_past}, + {"tokens_predicted", llama.num_tokens_predicted}, + {"timings", format_timings(llama)}, }; + + if (llama.params.n_probs > 0) + { + res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); + } + + return res; } -static json format_tokenizer_response(const std::vector & tokens) { - return json { - { "tokens", tokens } +static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector &probs) +{ + json res = json{ + {"content", content}, + {"stop", false}, }; + + if (llama.params.n_probs > 0) + { + res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); + } + + return res; } -static void parse_options_completion(const json & body, llama_server_context & llama) { +static json format_tokenizer_response(const std::vector &tokens) +{ + return json{ + {"tokens", tokens}}; +} + +static void parse_options_completion(const json &body, llama_server_context &llama) +{ gpt_params default_params; llama.stream = body.value("stream", false); @@ -728,22 +1016,31 @@ static void parse_options_completion(const json & body, llama_server_context & l llama.params.n_keep = body.value("n_keep", default_params.n_keep); llama.params.seed = body.value("seed", default_params.seed); llama.params.prompt = body.value("prompt", default_params.prompt); + llama.params.n_probs = 
+static void parse_options_completion(const json &body, llama_server_context &llama)
+{
     gpt_params default_params;

     llama.stream = body.value("stream", false);
@@ -728,22 +1016,31 @@ static void parse_options_completion(const json & body, llama_server_context & l
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.prompt = body.value("prompt", default_params.prompt);
+    llama.params.n_probs = body.value("n_probs", default_params.n_probs);

     llama.params.logit_bias.clear();
-    if (body.value("ignore_eos", false)) {
+    if (body.value("ignore_eos", false))
+    {
         llama.params.logit_bias[llama_token_eos()] = -INFINITY;
     }

-    const auto & logit_bias = body.find("logit_bias");
-    if (logit_bias != body.end() && logit_bias->is_array()) {
+    const auto &logit_bias = body.find("logit_bias");
+    if (logit_bias != body.end() && logit_bias->is_array())
+    {
         const int n_vocab = llama_n_vocab(llama.ctx);
-        for (const auto & el : *logit_bias) {
-            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
+        for (const auto &el : *logit_bias)
+        {
+            if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+            {
                 llama_token tok = el[0].get<llama_token>();
-                if (tok >= 0 && tok < n_vocab) {
-                    if (el[1].is_number()) {
+                if (tok >= 0 && tok < n_vocab)
+                {
+                    if (el[1].is_number())
+                    {
                         llama.params.logit_bias[tok] = el[1].get<float>();
-                    } else if (el[1].is_boolean() && !el[1].get()) {
+                    }
+                    else if (el[1].is_boolean() && !el[1].get<bool>())
+                    {
                         llama.params.logit_bias[tok] = -INFINITY;
                     }
                 }
@@ -752,10 +1049,13 @@ static void parse_options_completion(const json & body, llama_server_context & l
     }

     llama.params.antiprompt.clear();
-    const auto & stop = body.find("stop");
-    if (stop != body.end() && stop->is_array()) {
-        for (const auto & word : *stop) {
-            if (!word.empty()) {
+    const auto &stop = body.find("stop");
+    if (stop != body.end() && stop->is_array())
+    {
+        for (const auto &word : *stop)
+        {
+            if (!word.empty())
+            {
                 llama.params.antiprompt.push_back(word);
             }
         }
@@ -764,18 +1064,25 @@ static void parse_options_completion(const json & body, llama_server_context & l
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }
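Putting the pieces of parse_options_completion() together, a /completion request body can look like the following (values are illustrative; logit_bias entries are [token_id, bias] pairs, and a boolean false bias maps the token to -INFINITY so it can never be sampled):

    {
      "prompt": "Building a website can be done in 10 simple steps:",
      "stream": true,
      "n_probs": 4,
      "stop": ["\n###"],
      "logit_bias": [[15043, 1.5], [15044, false]]
    }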
-static void log_server_request(const Request & req, const Response & res) {
+static void log_server_request(const Request &req, const Response &res)
+{
     LOG_INFO("request", {
-        { "remote_addr", req.remote_addr },
-        { "remote_port", req.remote_port },
-        { "status", res.status },
-        { "path", req.path },
-        { "request", req.body },
-        { "response", res.body },
-    });
+                            {"remote_addr", req.remote_addr},
+                            {"remote_port", req.remote_port},
+                            {"status", res.status},
+                            {"method", req.method},
+                            {"path", req.path},
+                            {"params", req.params},
+                        });
+
+    LOG_VERBOSE("request", {
+                               {"request", req.body},
+                               {"response", res.body},
+                           });
 }

-int main(int argc, char ** argv) {
+int main(int argc, char **argv)
+{
     // own arguments required by this example
     gpt_params params;
     server_params sparams;
@@ -785,40 +1092,57 @@ int main(int argc, char ** argv) {

     server_params_parse(argc, argv, sparams, params);

-    if (params.model_alias == "unknown") {
+    if (params.model_alias == "unknown")
+    {
         params.model_alias = params.model;
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

-    LOG_INFO("build info", {
-        { "build", BUILD_NUMBER },
-        { "commit", BUILD_COMMIT }
-    });
+    LOG_INFO("build info", {{"build", BUILD_NUMBER},
+                            {"commit", BUILD_COMMIT}});

     LOG_INFO("system info", {
-        { "n_threads", params.n_threads },
-        { "total_threads", std::thread::hardware_concurrency() },
-        { "system_info", llama_print_system_info() },
-    });
+                                {"n_threads", params.n_threads},
+                                {"total_threads", std::thread::hardware_concurrency()},
+                                {"system_info", llama_print_system_info()},
+                            });

     // load the model
-    if (!llama.loadModel(params)) {
+    if (!llama.loadModel(params))
+    {
         return 1;
     }

     Server svr;

-    svr.set_default_headers({
-        { "Access-Control-Allow-Origin", "*" },
-        { "Access-Control-Allow-Headers", "content-type" }
-    });
+    svr.set_default_headers({{"Server", "llama.cpp"},
+                             {"Access-Control-Allow-Origin", "*"},
+                             {"Access-Control-Allow-Headers", "content-type"}});

-    svr.Get("/", [](const Request &, Response & res) {
-        res.set_content("<h1>llama.cpp server works</h1>", "text/html");
-    });
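The fallback handlers added below serve byte arrays that are compiled into the binary. This diff does not show how index_html/index_html_len and friends are generated; one common way to produce such symbols is xxd's C-array output, roughly:

    $ xxd -i index.html
    unsigned char index_html[] = {
        0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, /* "<html>" ... */
    };
    unsigned int index_html_len = 1234; /* hypothetical length */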

+    // this is only called if no index.html is found in the public --path
+    svr.Get("/", [](const Request &, Response &res)
+            {
+                res.set_content(reinterpret_cast<const char *>(&index_html), index_html_len, "text/html");
+                return false; });
+
+    // this is only called if no index.js is found in the public --path
+    svr.Get("/index.js", [](const Request &, Response &res)
+            {
+                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
+                return false; });
+
+    // this is only called if no completion.js is found in the public --path
+    svr.Get("/completion.js", [](const Request &, Response &res)
+            {
+                res.set_content(reinterpret_cast<const char *>(&completion_js), completion_js_len, "application/javascript");
+                return false; });
+
+    svr.Post("/completion", [&llama](const Request &req, Response &res)
+             {
+                 auto lock = llama.lock();

-    svr.Post("/completion", [&llama](const Request & req, Response & res) {
         llama.rewind();
+        llama_reset_timings(llama.ctx);

         parse_options_completion(json::parse(req.body), llama);

@@ -830,7 +1154,8 @@ int main(int argc, char ** argv) {
             size_t stop_pos = std::string::npos;

             while (llama.has_next_token) {
-                const std::string token_text = llama.doCompletion();
+                const completion_token_output token_with_probs = llama.doCompletion();
+                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);

                 stop_pos = llama.findStoppingStrings(llama.generated_text,
                     token_text.size(), STOP_FULL);
@@ -844,7 +1169,7 @@ int main(int argc, char ** argv) {
                     llama.generated_text.end());
             }

-            const json data = format_final_response(llama, llama.generated_text);
+            const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);

             llama_print_timings(llama.ctx);

@@ -853,9 +1178,11 @@ int main(int argc, char ** argv) {
         } else {
             const auto chunked_content_provider = [&](size_t, DataSink & sink) {
                 size_t sent_count = 0;
+                size_t sent_token_probs_index = 0;

                 while (llama.has_next_token) {
-                    const std::string token_text = llama.doCompletion();
+                    const completion_token_output token_with_probs = llama.doCompletion();
+                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
                     if (llama.multibyte_pending > 0) {
                         continue;
                     }
@@ -878,10 +1205,22 @@ int main(int argc, char ** argv) {
                     const std::string to_send = llama.generated_text.substr(pos, stop_pos);
                     sent_count += to_send.size();

+                    std::vector<completion_token_output> probs_output = {};
+
+                    if (llama.params.n_probs > 0) {
+                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+                        size_t probs_pos      = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                        if (probs_pos < probs_stop_pos) {
+                            probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                        }
+                        sent_token_probs_index = probs_stop_pos;
+                    }
+
                     const json data = llama.has_next_token
-                                          ? format_partial_response(to_send)
+                                          ? format_partial_response(llama, to_send, probs_output)
                                           // Generation is done, send extra information.
-                                          : format_final_response(llama, to_send);
+                                          : format_final_response(llama, to_send, llama.generated_token_probs);

                     const std::string str =
                         "data: " +
@@ -904,22 +1243,30 @@ int main(int argc, char ** argv) {
                 return true;
             };
             res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
-        }
-    });
+        } });

-    svr.Options(R"(/.*)", [](const Request &, Response & res) {
-        return res.set_content("", "application/json");
-    });
+    svr.Get("/model.json", [&llama](const Request &, Response &res)
+            {
+                const json data = format_generation_settings(llama);
+                return res.set_content(data.dump(), "application/json"); });
+
+    svr.Options(R"(/.*)", [](const Request &, Response &res)
+                { return res.set_content("", "application/json"); });
+
+    svr.Post("/tokenize", [&llama](const Request &req, Response &res)
+             {
+                 auto lock = llama.lock();

-    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
         const json body = json::parse(req.body);
         const std::string content = body.value("content", "");
         const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
         const json data = format_tokenizer_response(tokens);
-        return res.set_content(data.dump(), "application/json");
-    });
+        return res.set_content(data.dump(), "application/json"); });
+
+    svr.Post("/embedding", [&llama](const Request &req, Response &res)
+             {
+                 auto lock = llama.lock();

-    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
         const json body = json::parse(req.body);

         llama.rewind();
@@ -931,12 +1278,12 @@ int main(int argc, char ** argv) {
         llama.doCompletion();

         const json data = format_embedding_response(llama);
-        return res.set_content(data.dump(), "application/json");
-    });
+        return res.set_content(data.dump(), "application/json"); });

     svr.set_logger(log_server_request);

-    svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
+                              {
         const auto * fmt = "500 Internal Server Error\n%s";
         char buf[BUFSIZ];
         try {
@@ -947,29 +1294,40 @@ int main(int argc, char ** argv) {
             snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
         }
         res.set_content(buf, "text/plain");
-        res.status = 500;
-    });
+        res.status = 500; });
+
+    svr.set_error_handler([](const Request &, Response &res)
+                          {
+                              res.set_content("File Not Found", "text/plain");
+                              res.status = 404; });

     // set timeouts and change hostname and port
     svr.set_read_timeout(sparams.read_timeout);
     svr.set_write_timeout(sparams.write_timeout);

-    if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
-        LOG_ERROR("couldn't bind to server socket", {
-            { "hostname", sparams.hostname },
-            { "port", sparams.port },
-        });
+    if (!svr.bind_to_port(sparams.hostname, sparams.port))
+    {
+        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
         return 1;
     }

+    // Set the base directory for serving static files
+    svr.set_base_dir(sparams.public_path);
+
+    // to make it ctrl+clickable:
+    fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
+
     LOG_INFO("HTTP server listening", {
-        { "hostname", sparams.hostname },
-        { "port", sparams.port },
-    });
+                                          {"hostname", sparams.hostname},
+                                          {"port", sparams.port},
+                                      });

-    if (!svr.listen_after_bind()) {
+    if (!svr.listen_after_bind())
+    {
         return 1;
     }

+    llama_backend_free();
+
     return 0;
 }
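With the server built and running, the /completion endpoint can be exercised with any HTTP client; for example (hostname and port assumed to be the defaults):

    curl -s -X POST http://127.0.0.1:8080/completion \
        -H 'Content-Type: application/json' \
        -d '{"prompt": "Hello", "n_predict": 8, "stream": true}'

With "stream": true the reply is a text/event-stream in which each chunk arrives as a "data: " line framed by the chunked content provider above, roughly (payloads abbreviated):

    data: {"content":" world","stop":false}

    data: {"content":"","stop":true,"generation_settings":{...},"timings":{...}}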
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 2d913cebb..aa2c4352d 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
     llama_free( ctx );
     llama_free_model( model );

+    llama_backend_free();
+
     return 0;
 }
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 05bfa8016..afbb4a777 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
     return rnd->rd(rnd->gen);
 }

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
@@ -1343,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
         }
     }

-    if (t->src0) {
-        expand(g, t->src0);
-    }
-
-    if (t->src1) {
-        expand(g, t->src1);
-    }
-
-    for (int i = 0; i < GGML_MAX_OPT; ++i) {
-        if (t->opt[i]) {
-            expand(g, t->opt[i]);
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        if (t->src[i]) {
+            expand(g, t->src[i]);
         }
     }

@@ -1426,11 +1429,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;

     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;
@@ -2671,7 +2672,8 @@ struct train_params {
     const char * fn_checkpoint_out;
     const char * fn_model_out;

-    int seed;
+    uint32_t seed;
+
     int n_ctx;
     int n_embd;
     int n_mult;
@@ -3161,6 +3163,7 @@ int main(int argc, char ** argv) {
     printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
     // ggml_print_tensor_objects(model.ctx);

+    // TODO: use std::vector instead of "new"
     size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
     uint8_t * compute_addr = new uint8_t[compute_size];

@@ -3182,6 +3185,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }

+    std::vector<uint8_t> work_buffer;
+
     printf("%s: begin training\n", __func__);

     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3216,9 +3221,6 @@ int main(int argc, char ** argv) {
         struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
         struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;

-        // ggml_cgraph gf = {};
-        gf->n_threads = params.n_threads;
-        gb->n_threads = params.n_threads;

         get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);

@@ -3247,7 +3249,7 @@ int main(int argc, char ** argv) {
             *gb = ggml_build_backward(ctx0, gf, true);
         }

-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

         size_t used_mem_before_opt = ggml_used_mem(ctx0);

@@ -3271,7 +3273,7 @@ int main(int argc, char ** argv) {
         model.train_samples += n_batch;
         model.train_tokens  += n_batch * n_tokens;

-
ggml_graph_compute(ctx0, gf); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3353,13 +3355,12 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(cparams); ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); @@ -3385,6 +3386,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); llama_free_model(lmodel); ggml_free(model.ctx); diff --git a/flake.nix b/flake.nix index cebb47b94..dda7ee7b7 100644 --- a/flake.nix +++ b/flake.nix @@ -43,6 +43,8 @@ "-DLLAMA_METAL=ON" ]); installPhase = '' + runHook preInstall + mkdir -p $out/bin mv bin/* $out/bin/ mv $out/bin/main $out/bin/llama @@ -51,6 +53,8 @@ echo "#!${llama-python}/bin/python" > $out/bin/convert.py cat ${./convert.py} >> $out/bin/convert.py chmod +x $out/bin/convert.py + + runHook postInstall ''; meta.mainProgram = "llama"; }; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4e0d3dbde..d3054a7fa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,6 +13,8 @@ #include "ggml-cuda.h" #include "ggml.h" +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -59,8 +61,8 @@ typedef float2 dfloat2; #endif //GGML_CUDA_DMMV_F16 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_op_t)( @@ -70,9 +72,11 @@ typedef void (*ggml_cuda_op_t)( // QK = number of values after dequantization // QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization #define QK4_0 32 #define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) typedef struct { half d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants @@ -81,6 +85,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 #define QK4_1 32 #define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) typedef struct { half d; // delta half m; // min @@ -90,6 +95,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong #define QK5_0 32 #define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) typedef struct { half d; // delta uint8_t qh[4]; // 5-th bit of quants @@ -99,6 +105,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 #define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * 
QR5_1)) typedef struct { half d; // delta half m; // min @@ -109,12 +116,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + #define QK8_0 32 #define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) typedef struct { half d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct { + half d; // delta + half s; // unquantized sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs); + //================================= k-quants #ifdef GGML_QKK_64 @@ -125,6 +145,8 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define K_SCALE_SIZE 12 #endif +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants @@ -133,6 +155,8 @@ typedef struct { } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits @@ -145,6 +169,8 @@ typedef struct { } block_q3_K; //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { half d[2]; // super-block scales/mins @@ -162,6 +188,8 @@ typedef struct { static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); #endif +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) #ifdef GGML_QKK_64 typedef struct { half d; // super-block scale @@ -181,6 +209,8 @@ typedef struct { static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); #endif +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits @@ -190,22 +220,25 @@ typedef struct { static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 256 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 +#define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_SCALE_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 +#define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 #endif -#ifndef GGML_CUDA_DMMV_Y -#define GGML_CUDA_DMMV_Y 1 +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 #endif #ifndef K_QUANTS_PER_ITERATION @@ -214,13 +247,18 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); #endif -static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs +}; + +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i >= kx) { return; } - dst[i] = x[i] + y[i]; + dst[i] = x[i] + y[i%ky]; } static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { @@ -241,6 +279,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] * y[i%ky]; } +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + static __global__ void silu_f32(const float * x, float * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -250,32 +301,60 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __global__ void norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-5f; + + float mean = 0.0f; + float var = 0.0f; + + for (int col = tid; col < ncols; col += WARP_SIZE) { + const float xi = x[row*ncols + col]; + mean += xi; + var += xi * xi; + } + + // sum up partial sums +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); + var += __shfl_xor_sync(0xffffffff, var, mask, 32); + } + + mean /= ncols; + var = var / ncols - mean * mean; + const float inv_var = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += WARP_SIZE) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var; + } +} + static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; - const float eps = 1e-6; + const float eps = 1e-6f; float tmp = 0.0f; // partial sum for thread in warp - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { const float xi = x[row*ncols + 
col]; tmp += xi * xi; } // sum up partial sums - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } const float mean = tmp / ncols; - const float scale = 1.0f / sqrtf(mean + eps); + const float scale = rsqrtf(mean + eps); - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { dst[row*ncols + col] = scale * x[row*ncols + col]; } } @@ -384,7 +463,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -417,7 +496,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -481,7 +560,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -521,7 +600,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -567,7 +646,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -611,7 +690,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -709,7 +788,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float #endif // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -720,7 +798,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + 
threadIdx.y; if (row > nrows) return; @@ -814,7 +892,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float #endif // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -825,7 +902,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -918,7 +995,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float #endif // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -929,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { const int row = blockIdx.x; const int num_blocks_per_row = ncols / QK_K; @@ -1023,7 +1099,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float #endif // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1034,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -1134,7 +1209,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float #endif // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1153,8 +1227,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i / QK8_1; // block index + const int iqs = i % QK8_1; // quant index + + const float xi = i < ndata ? x[i] : 0.0f; + float amax = fabsf(xi); + float sum = xi; + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); + sum += __shfl_xor_sync(0xffffffff, sum, mask, 32); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 
0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + y[ib].d = d; + y[ib].s = sum; +} + template -static __global__ void dequantize_block(const void * vx, float * y, const int k) { +static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; if (i >= k) { @@ -1174,8 +1283,402 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) y[iybs + iqs + y_offset] = v.y; } +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int vi; + memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); + const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]); + + const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d); + + // subtract 8 from each quantized value + const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808); + const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808); + + // SIMD dot product of quantized values + int sumi = __dp4a(vi0, ui0, 0); + sumi = __dp4a(vi1, ui1, sumi); + + return sumi*d; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); + const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]); + + const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d); + const float m = bq4_1->m; + const float s = bq8_1->s; + + const int vi0 = (vi >> 0) & 0x0F0F0F0F; + const int vi1 = (vi >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + int sumi = __dp4a(vi0, ui0, 0); + sumi = __dp4a(vi1, ui1, sumi); + + return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int qs; + memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); + const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2); + const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2); + const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]); + + const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d); + + int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits + vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5 + vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13 + vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21 + vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29 + vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values + int sumi = 
__dp4a(vi0, ui0, 0); // SIMD dot product of quantized values + + int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits + vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5 + vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13 + vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21 + vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29 + vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values + sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values + + return sumi*d; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); + const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2); + const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2); + const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]); + + const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d); + const float m = bq5_1->m; + const float s = bq8_1->s; + + int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits + vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5 + vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13 + vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21 + vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29 + int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values + + int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits + vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5 + vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13 + vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21 + vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29 + sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values + + return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int vi; + memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int)); + const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]); + + const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d); + + // SIMD dot product of quantized values + int sumi = __dp4a(vi, ui, 0); + + return sumi*d; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq2_K->d; + const float dmin = bq2_K->dmin; + + const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR2_K; ++i) { + const int sc = 
bq2_K->scales[scale_offset + 2*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const float d8i = bq8i->d; + + const int vi = (v >> (2*i)) & 0x03030303; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + + sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf = 0.0f; + + const float d = bq3_K->d; + + int vl; + memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int)); + vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted + vh >>= bq8_offset; + + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + const int bq8_offset = QR4_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq4_K->d; + const float dmin = bq4_K->dmin; + + const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR4_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq4_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vi = (v >> (4*i)) & 0x0F0F0F0F; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ 
>= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int bq8_offset = QR5_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq5_K->d; + const float dmin = bq5_K->dmin; + + const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]); + + const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset; + + for (int i = 0; i < QR5_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq5_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> i) << 4) & 0x10101010; + + const int vi = vil | vih; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + float sumf = 0.0f; + + const float d = bq6_K->d; + + int vl; + memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int)); + + for (int i = 0; i < QR6_K; ++i) { + const int sc = bq6_K->scales[scale_offset + 4*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index + + const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx + + const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask 
>>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + template -static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) { +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -1228,7 +1731,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1243,7 +1745,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } } -static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; @@ -1279,7 +1781,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl const int idst = channel*nrows_dst + row_dst; // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1291,7 +1792,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl } static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous - const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int channel_stride_x) { const half * x = (const half *) vx; @@ -1325,7 +1826,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } // sum up partial sums and write back result - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1397,6 +1897,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const float col_theta_scale = powf(theta_scale, col); + + const float theta = p*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = block_p*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + 
const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.y*blockIdx.y + threadIdx.y; @@ -1435,7 +1969,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol } // sum up partial sums - __syncthreads(); #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); @@ -1463,9 +1996,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } -static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; - add_f32<<>>(x, y, dst, k); +static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<>>(x, y, dst, kx, ky); } static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { @@ -1478,17 +2011,33 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in mul_f32<<>>(x, y, dst, kx, ky); } +static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_f32<<>>(x, dst, k); +} + static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; silu_f32<<>>(x, dst, k); } +static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + const dim3 block_dims(WARP_SIZE, 1, 1); + norm_f32<<>>(x, dst, ncols); +} + static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % WARP_SIZE == 0); const dim3 block_dims(WARP_SIZE, 1, 1); rms_norm_f32<<>>(x, dst, ncols); } +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + quantize_q8_1<<>>(x, vy, ndata, k); +} + static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); @@ -1557,45 +2106,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } static void 
dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -1642,6 +2191,96 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); } +static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 
1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<1, 1, convert_f16><<>>(vx, y, k); @@ -1649,9 +2288,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int 
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); @@ -1649,9 +2288,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); dequantize_mul_mat_vec<1, 1, convert_f16> <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); } @@ -1734,6 +2373,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale); } +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(nrows % 4 == 0); + const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale); +} + static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1); const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; @@ -1817,6 +2464,7 @@ static size_t g_scratch_offset = 0; static int g_device_count = -1; static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -1834,9 +2482,12 @@ void ggml_init_cublas() { for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %d: %s\n", id, prop.name); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; + + g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; } for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; @@ -1957,20 +2608,22 @@ inline void ggml_cuda_op_add( GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); - const int64_t ne0 = src0->ne[0]; + const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); } else { GGML_ASSERT(false); } - CUDA_CHECK(cudaGetLastError()); (void) src1; (void) dst; @@ -1986,30 +2639,43 @@ inline void ggml_cuda_op_mul( GGML_ASSERT(src0_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1];
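Note (annotation, not part of the patch): in the rewritten ggml_cuda_op_mul, the old host-side loop over rows (removed below) is collapsed into a single flattened kernel launch; src1 is broadcast across src0 by indexing it modulo its own element count. A scalar C sketch of the kernel's semantics, not the CUDA kernel itself:

    /* dst and x have kx elements; y has ky elements and repeats every ky */
    static void mul_bcast_f32(const float * x, const float * y, float * dst,
                              int kx, int ky) {
        for (int i = 0; i < kx; ++i) {
            dst[i] = x[i] * y[i % ky];
        }
    }

With kx = ne00*i01_diff and ky = ne10*ne11 this reproduces the per-row broadcast that the removed loop implemented one row at a time.

- for (int64_t i01 = i01_low; i01 < i01_high; i01++) { -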
const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0 - - float * src0_ddf_i01 = src0_ddf_i + i01*ne00; - float * src1_ddf_i01 = src1_ddf_i + i11*ne10; - float * dst_ddf_i01 = dst_ddf_i + i01*ne00; - - // compute - mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); - } + mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); (void) dst; (void) src0_ddq_i; (void) i02; } +inline void ggml_cuda_op_gelu( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + inline void ggml_cuda_op_silu( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -2023,7 +2689,28 @@ inline void ggml_cuda_op_silu( // compute silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); (void) src1; (void) dst; @@ -2046,7 +2733,6 @@ inline void ggml_cuda_op_rms_norm( // compute rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); (void) src1; (void) dst; @@ -2056,7 +2742,7 @@ inline void ggml_cuda_op_rms_norm( (void) i1; } -inline void ggml_cuda_op_dequantize_mul_mat_vec( +inline void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, cudaStream_t & cudaStream_main){ @@ -2068,70 +2754,139 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( const int64_t ne00 = src0->ne[0]; const int64_t nrows = i01_high - i01_low; -// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_CUDA_DMMV_F16 - size_t ash; - dfloat * src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); - ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, - ne00, 1, sizeof(float), 0, 0, - ne00, 1, 
sizeof(half), 0, 0, cudaStream_main); - } +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; #else - dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + bool mul_mat_vec_q_implemented = + src0->type == GGML_TYPE_Q4_0 || + src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || + src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0; +#if QK_K == 256 + mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || + src0->type == GGML_TYPE_Q2_K || + src0->type == GGML_TYPE_Q3_K || + src0->type == GGML_TYPE_Q4_K || + src0->type == GGML_TYPE_Q5_K || + src0->type == GGML_TYPE_Q6_K; +#endif // QK_K == 256 + + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; +#endif + + if (use_mul_mat_vec_q) { + int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; + padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; + size_t as; + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } + + ggml_cuda_pool_free(src1_q8_1, as); + } else { + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_DMMV_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, cudaStream_main); + } +#else + dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion #endif // GGML_CUDA_DMMV_F16 - switch (src0->type) { - case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, 
dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - default: - GGML_ASSERT(false); - break; - } - CUDA_CHECK(cudaGetLastError()); + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } #ifdef GGML_CUDA_DMMV_F16 - if (src1_convert_f16) { - ggml_cuda_pool_free(src1_dfloat, ash); - } + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } #endif // GGML_CUDA_DMMV_F16 + } (void) src1; (void) dst; @@ -2195,14 +2950,21 @@ inline void ggml_cuda_op_rope( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - GGML_ASSERT(mode == 0); + const int n_ctx = ((int32_t *) src1->data)[3]; const float theta_scale = powf(10000.0, -2.0f/n_dims); const float p = ((mode & 1) == 0 ? 
n_past + i02 : i02); + bool is_glm = mode & 4; + // compute - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); + if (is_glm) { + const float id_p = min(p, n_ctx - 2.f); + const float block_p = max(p - (n_ctx - 2.f), 0.f); + rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); + } else { + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + } (void) dst; (void) src0_ddq_i; @@ -2226,7 +2988,6 @@ inline void ggml_cuda_op_diag_mask_inf( // compute diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); (void) dst; (void) src0_ddq_i; @@ -2248,7 +3009,6 @@ inline void ggml_cuda_op_soft_max( // compute soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); (void) src1; (void) dst; @@ -2344,10 +3104,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; - // if multiple GPUs are used they need to wait for the main GPU to finish + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signifies that the main device has finished calculating the input data if (split && g_device_count > 1) { CUDA_CHECK(cudaSetDevice(g_main_device)); - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device])); } for (int id = 0; id < g_device_count; ++id) { @@ -2373,6 +3134,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t row_diff = row_high - row_low; cudaSetDevice(id); + cudaStream_t cudaStream_main = g_cudaStreams_main[id]; + + // wait for main GPU data if necessary + if (split && id != g_main_device) { + CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device])); + } if (src0_on_device && src0_is_contiguous) { if (src0_is_f32) { @@ -2448,8 +3215,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } const int64_t i11 = i13*ne12 + i12; - cudaStream_t cudaStream_main = g_cudaStreams_main[id]; - // for split tensors the data begins at i0 == i0_offset_low char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; @@ -2509,6 +3274,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // do the computation op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); // copy dst to host or other device if necessary if (!dst_on_device) { @@ -2538,6 +3304,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); } } + + // signify to main device that other device is done + if (split && g_device_count > 1 && id != g_main_device) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main)); + } } } } @@ -2549,7 +3320,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } CUDA_CHECK(cudaSetDevice(id)); - CUDA_CHECK(cudaDeviceSynchronize()); if (src0_asq[id] > 0) { ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); @@ 
-2564,6 +3334,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]); } } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + for (int id = 0; id < g_device_count; ++id) { + if (id != g_main_device) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id])); + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(cudaDeviceSynchronize()); + } } void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2582,11 +3367,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten } +void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true); +} + void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); } +void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); +} + void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); @@ -2679,8 +3474,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ }else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { - if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false); + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); } else { ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); } @@ -2742,6 +3537,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens (void) dst; } +void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); @@ -2765,7 +3565,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_backend backend = tensor->backend; struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -2794,76 +3598,113 @@ void 
ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int64_t nrows_split = row_high - row_low; const size_t offset_split = row_low*nb1; - const size_t size = ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; - void * buf; + // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - void * buf_host = (char*)data + offset_split; + char * buf_host = (char*)data + offset_split; - cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + + CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; + + if (backend == GGML_BACKEND_GPU_SPLIT) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming)); + } } tensor->extra = extra; } void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) { + if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { return; } ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; for (int id = 0; id < g_device_count; ++id) { - if (extra->data_device[id] == nullptr) { - continue; + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaFree(extra->data_device[id])); } - CUDA_CHECK(cudaSetDevice(id)); - CUDA_CHECK(cudaFree(extra->data_device[id])); + if (extra->events[id] != nullptr) { + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaEventDestroy(extra->events[id])); + } } delete extra; } +static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) { if (scratch && g_scratch_size == 0) { return; } // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src0->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { - ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace); + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } - if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace); + if 
(tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace); } tensor->backend = GGML_BACKEND_GPU; - struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); + struct ggml_tensor_extra_gpu * extra; - const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || force_inplace; const size_t size = ggml_nbytes(tensor); CUDA_CHECK(cudaSetDevice(g_main_device)); - if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->opt[0]->data, sizeof(size_t)); + memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra; + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { GGML_ASSERT(size <= g_scratch_size); @@ -2876,6 +3717,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); g_scratch_buffer = data; } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = data + g_scratch_offset; g_scratch_offset += size; @@ -2885,6 +3727,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo void * data; CUDA_CHECK(cudaMalloc(&data, size)); CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); extra->data_device[g_main_device] = data; } @@ -2933,10 +3777,16 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { + case GGML_OP_DUP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_ADD: if (!any_on_device) { return false; @@ -2949,12 +3799,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_mul; break; + case GGML_OP_GELU: + if (!any_on_device) { + return false; + } + func = ggml_cuda_gelu; + break; case GGML_OP_SILU: 
if (!any_on_device) { return false; } func = ggml_cuda_silu; break; + case GGML_OP_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_norm; + break; case GGML_OP_RMS_NORM: if (!any_on_device) { return false; @@ -2962,7 +3824,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; } func = ggml_cuda_mul_mat; @@ -2979,6 +3841,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_cpy; break; + case GGML_OP_CONT: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -3016,6 +3884,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return true; } - func(tensor->src0, tensor->src1, tensor); + func(tensor->src[0], tensor->src[1], tensor); return true; } diff --git a/ggml-cuda.h b/ggml-cuda.h index 7a65a3558..3c1e8deb6 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -8,10 +8,6 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 -struct ggml_tensor_extra_gpu { - void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors -}; - void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); diff --git a/ggml-metal.h b/ggml-metal.h index b9e50ac74..928f1705c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/ggml-metal.m b/ggml-metal.m index 7551231b9..ee205bcdf 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -25,6 +25,8 @@ struct ggml_metal_buffer { }; struct ggml_metal_context { + int n_cb; + float * logits; id<MTLDevice> device; @@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal"; @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -202,10 +205,16 @@ struct ggml_metal_context * ggml_metal_init(void) { void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); - + for (int i = 0; i < ctx->n_buffers; ++i) { + [ctx->buffers[i].metal release]; + } free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} +
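Note (annotation, not part of the patch): ggml_metal_init now takes the number of command buffers explicitly instead of deriving it from gf->n_threads at graph-compute time, and ggml_metal_set_n_cb lets callers retune it later. A hypothetical call sequence, with all surrounding setup elided:

    /* create a context that encodes with a single command buffer */
    struct ggml_metal_context * ctx = ggml_metal_init(1);

    /* ... later, split graph encoding across 4 command buffers in parallel */
    ggml_metal_set_n_cb(ctx, 4);

    /* mapped Metal buffers are now released explicitly inside ggml_metal_free */
    ggml_metal_free(ctx);

// finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1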
mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -352,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; @@ -384,8 +393,8 @@ void ggml_metal_graph_compute( for (int i = node_start; i < node_end; ++i) { metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * dst = gf->nodes[i]; const int64_t ne00 = src0 ? src0->ne[0] : 0; @@ -441,6 +450,7 @@ void ggml_metal_graph_compute( //} switch (dst->op) { + case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: @@ -730,8 +740,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q3_K || @@ -872,28 +881,35 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *)(src1->data))[0]; + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 
length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index e62fe6842..9f9a4fbd7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -365,6 +365,10 @@ kernel void kernel_rms_norm( } } +// putting them in the kernel cause a significant performance penalty +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 kernel void kernel_mul_mat_q4_0_f32( device const void * src0, device const float * src1, @@ -372,64 +376,83 @@ kernel void kernel_mul_mat_q4_0_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK4_0; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_0 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_0 + first; - - float2 acc = {0.0f, 0.0f}; - - for (int j = 0; j < 4; ++j) { - - acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4); - acc[1] += yl[j] + yl[j+16]; + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. 
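+ // [annotation, not part of the patch] worked example of the tiling above: with
+ // N_SIMDWIDTH = 32 threads per SIMD group and QK4_0 = 32, one iteration of the
+ // column loop below consumes 32 consecutive q4_0 blocks, i.e. 32*32 = 1024 y values;
+ // a row with ne00 = 4096 has nb = 128 blocks, so the loop runs nb/N_SIMDWIDTH = 4
+ // times, and any trailing (nb % N_SIMDWIDTH) blocks are handled by the else branch
+ // further down. Each SIMD group accumulates N_DST = 4 rows at once, so a threadgroup
+ // of N_SIMDGROUP = 2 groups produces 8 dst rows, matching the (ne01 + 7) / 8
+ // threadgroup dispatch in ggml-metal.m.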
+ for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } + sumy *= (-8.f); - sumf += d * (acc[0] - 8.f*acc[1]); + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc; + qb_curr = qb_next; + } } - sum[ith] = sumf; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + sumy *= (-8.f); + + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc; + } + qb_curr = qb_next; + + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } @@ -440,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { - const int nb = ne00/QK4_1; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb; + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_1 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; - const uint nth = tptg.x*tptg.y; - const uint ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first 
= 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - const float m = (float)x[i].m; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_1 + first; - - float2 acc = {0.0f, 0.0f}; - - for (int j = 0; j < 4; ++j) { - - acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m); - acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m); + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } - sumf += acc[0] + acc[1]; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc + m * sumy; + qb_curr = qb_next; + } } - sum[ith] = sumf; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc + m * sumy; + } + qb_curr = qb_next; + + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } @@ -615,17 +656,19 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(10000.0, -2.0f/n_dims); + const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { diff --git a/ggml-mpi.c b/ggml-mpi.c new file mode 100644 index 000000000..ae176d707 --- /dev/null +++ b/ggml-mpi.c @@ -0,0 +1,216 @@ +#include "ggml-mpi.h" + +#include "ggml.h" + +#include <mpi.h> + +#include <stdio.h> +#include <stdlib.h> + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define UNUSED GGML_UNUSED + +struct ggml_mpi_context { + int rank; + int size; +}; + +void ggml_mpi_backend_init(void) { + MPI_Init(NULL, NULL); +} + +void ggml_mpi_backend_free(void) { + MPI_Finalize(); +} + +struct ggml_mpi_context * ggml_mpi_init(void) { + struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context)); + + MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); + MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + + return ctx; +} + +void ggml_mpi_free(struct ggml_mpi_context * ctx) { + free(ctx); +} + +int ggml_mpi_rank(struct ggml_mpi_context * ctx) { + return ctx->rank; +} + +void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads) { + UNUSED(ctx_mpi); + + // synchronize the worker node parameters with the root node + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD); +} + +static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { + struct ggml_tensor * t = ggml_graph_get_tensor(gf, name); + if (t == NULL) { + fprintf(stderr, "%s: tensor %s not found\n", __func__, name); + return -1; + } + + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->nodes[i] == t) { + return i; + } + } + + fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name); + return -1; +} + +static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { + MPI_Datatype mpi_type; + + switch (t->type) { + case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; + case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + default: GGML_ASSERT(false && "not implemented"); + } + + const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { + MPI_Datatype mpi_type; + + switch (t->type) { + case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; + case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + default: GGML_ASSERT(false && "not implemented"); + } + + MPI_Status status; UNUSED(status); + + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + GGML_ASSERT(retval == MPI_SUCCESS); +} + +// TODO: there are many improvements that can be done to this implementation +void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers) { + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + if (inp_tokens == NULL) { + fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); + return; + } + + struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); + if (inp0 == NULL) { + fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); + return; + } + + GGML_ASSERT(inp0 == gf->nodes[0]); + + // distribute the compute graph into slices across the MPI nodes + // + // the main node (0) processes the
last layers + the remainder of the compute graph + // and is responsible to pass the input tokens to the first node (1) + // + // node 1: [( 0) * n_per_node, ( 1) * n_per_node) + // node 2: [( 1) * n_per_node, ( 2) * n_per_node) + // ... + // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node) + // node 0: [(n-1) * n_per_node, n_nodes) + // + if (mpi_rank > 0) { + if (mpi_rank == 1) { + // the first node (1) receives the input tokens from the main node (0) + ggml_mpi_tensor_recv(inp_tokens, 0); + } else { + // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) + ggml_mpi_tensor_recv(inp0, mpi_rank - 1); + } + } else if (mpi_size > 1) { + // node 0 sends the input tokens to node 1 + ggml_mpi_tensor_send(inp_tokens, 1); + + // recv the output data from the last node + ggml_mpi_tensor_recv(inp0, mpi_size - 1); + } + + { + const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + + const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; + + const int il0 = (mpi_idx + 0) * n_per_node; + const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + + char name_l0[GGML_MAX_NAME]; + char name_l1[GGML_MAX_NAME]; + + snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0); + snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); + + const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); + const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; + + if (idx_l0 < 0 || idx_l1 < 0) { + fprintf(stderr, "%s: layer input nodes not found\n", __func__); + return; + } + + // attach the input data to all nodes that need it + // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) + for (int i = idx_l0; i < idx_l1; i++) { + if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[0] = inp0; + } + if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[1] = inp0; + } + } + + // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph + for (int i = 1; i < idx_l1 - idx_l0; i++) { + gf->nodes[i] = gf->nodes[idx_l0 + i]; + gf->grads[i] = gf->grads[idx_l0 + i]; + } + + // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node + if (mpi_idx != 0) { + gf->nodes[0]->op = GGML_OP_NONE; + } + + gf->n_nodes = idx_l1 - idx_l0; + + //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1); + } +} + +void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers) { + UNUSED(n_layers); + + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + // send the output data to the next node + if (mpi_rank > 0) { + ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size); + } +} diff --git a/ggml-mpi.h b/ggml-mpi.h new file mode 100644 index 000000000..eda119d44 --- /dev/null +++ b/ggml-mpi.h @@ -0,0 +1,39 @@ +#pragma once + +struct ggml_context; +struct ggml_tensor; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_mpi_context; + +void ggml_mpi_backend_init(void); +void ggml_mpi_backend_free(void); + +struct ggml_mpi_context * ggml_mpi_init(void); +void ggml_mpi_free(struct ggml_mpi_context * ctx); + +int ggml_mpi_rank(struct ggml_mpi_context * ctx); + +void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads); + +void ggml_mpi_graph_compute_pre( + 
struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + +void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + +#ifdef __cplusplus +} +#endif
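Note (annotation, not part of the patch): the ggml-mpi API above is meant to be driven in the same order on every rank, with rank 0 owning the real input tokens and the final output. A hypothetical driver sketch, where gf, n_layers and the token setup are assumed to exist:

    struct ggml_mpi_context * ctx_mpi = ggml_mpi_init();

    int n_tokens = 8, n_past = 0, n_threads = 4;
    /* broadcasts the evaluation parameters from rank 0 to all workers */
    ggml_mpi_eval_init(ctx_mpi, &n_tokens, &n_past, &n_threads);

    /* receive this rank's input and trim the graph to its slice of layers */
    ggml_mpi_graph_compute_pre(ctx_mpi, gf, n_layers);

    /* ... compute the local slice of gf here ... */

    /* forward the last node's output to the next rank in the pipeline */
    ggml_mpi_graph_compute_post(ctx_mpi, gf, n_layers);

    ggml_mpi_free(ctx_mpi);

ggml_mpi_backend_init and ggml_mpi_backend_free bracket the whole process lifetime, mirroring MPI_Init and MPI_Finalize.

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 8573ac2d6..75c8aa560 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int is = 0; -#else + +\n#else\n + const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + +\n#endif\n + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const float d = vload_half(0, &x[i].d); -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) @@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); tmp[16 * ix + tid] += sum; -#else +\n#else\n float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp[16 * ix + tid] += sum; -#endif +\n#endif\n } @@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; const int64_t ne0 = ne00 * ne01 * ne02 * ne03; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1];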
diff --git a/ggml.c b/ggml.c index c52717100..0d3a15a04 100644 --- a/ggml.c +++ b/ggml.c @@ -25,16 +25,23 @@ #include <stdio.h> #include <float.h> #include <limits.h> +#include <stdarg.h> #ifdef GGML_USE_METAL #include <unistd.h> #endif +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#endif #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts @@ -49,23 +56,23 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; -static void atomic_store(atomic_int* ptr, LONG val) { +static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } -static LONG atomic_load(atomic_int* ptr) { +static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) @@ -77,7 +84,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void return 0; } -static int pthread_join(pthread_t thread, void* unused) { +static int pthread_join(pthread_t thread, void * unused) { (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -90,7 +97,7 @@ static int sched_yield (void) { #include <pthread.h> #include <stdatomic.h> -typedef void* thread_ret_t; +typedef void * thread_ret_t; #include <sys/types.h> #include <sys/stat.h> @@ -111,10 +118,6 @@ typedef void* thread_ret_t; #endif #endif -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -220,16 +223,38 @@ inline static void* ggml_aligned_malloc(size_t size) { #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif -#define UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) +// +// tensor access macros +// + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + #if defined(GGML_USE_ACCELERATE) #include <Accelerate/Accelerate.h> #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include <mkl.h> +#else #include <cblas.h> +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -465,14 +490,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } -void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { - for (size_t i = 0; i < n; i++) { +void ggml_fp16_to_fp32_row(const ggml_fp16_t *
x, float * y, int n) { + for (int i = 0; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { - size_t i = 0; +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { __m256 x_vec = _mm256_loadu_ps(x + i); @@ -1611,109 +1636,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in } } +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }, + [GGML_TYPE_F16] = { + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }, [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q4_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q5_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, - .quantize_row_q = quantize_row_q5_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q5_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, - .quantize_row_q = quantize_row_q5_1, - .quantize_row_q_reference = 
(quantize_row_q_t) quantize_row_q5_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q5_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q8_0] = { - .dequantize_row_q = dequantize_row_q8_0, - .quantize_row_q = quantize_row_q8_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q8_0_q8_0, + .to_float = dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q8_1] = { - .dequantize_row_q = NULL, // TODO - .quantize_row_q = quantize_row_q8_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = NULL, // TODO + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, #ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, - .quantize_row_q = quantize_row_q2_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q2_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, - .quantize_row_q = quantize_row_q3_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q3_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, - .quantize_row_q = quantize_row_q4_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q4_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, - .quantize_row_q = quantize_row_q5_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q5_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, - .quantize_row_q = quantize_row_q6_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, - 
.quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q6_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_Q8_K] = { + .from_float = quantize_row_q8_K, + } #endif }; // For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) { GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; + return type_traits[i]; } @@ -2259,7 +2287,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -2296,7 +2324,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -3449,6 +3477,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
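/*
 * A minimal sketch of how the renamed type traits are consumed (the getter is in this diff; the
 * q4_0 block layout below — QK4_0 = 32 values per 18-byte block — is an assumption about the
 * surrounding code, not something these hunks show):
 *
 *     ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
 *     float   src[32] = { 0 };
 *     uint8_t blk[18];               // one assumed block_q4_0
 *     float   dst[32];
 *     tt.from_float(src, blk, 32);   // quantize one row
 *     tt.to_float (blk, dst, 32);    // dequantize it back
 *     // tt.vec_dot expects its second operand quantized as tt.vec_dot_type (GGML_TYPE_Q8_0 here)
 */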
x[i] : 0.f; } static const float GELU_COEF_A = 0.044715f; @@ -3600,6 +3630,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x *s = 1.f/(*s); } +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + // // data types // @@ -3709,12 +3749,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM", "SUM_ROWS", "MEAN", + "ARGMAX", "REPEAT", "REPEAT_BACK", "ABS", "SGN", "NEG", "STEP", + "TANH", + "ELU", "RELU", "GELU", "GELU_QUICK", @@ -3746,9 +3789,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D_S1_PH", - "CONV_1D_S2_PH", - "CONV_2D_SK_P0", + "CONV_1D", + "CONV_2D", + "POOL_1D", + "POOL_2D", "FLASH_ATTN", "FLASH_FF", @@ -3767,7 +3811,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3785,12 +3829,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx", "Σx_k", "Σx/n", + "argmax(x)", "repeat(x)", "repeat_back(x)", "abs(x)", "sgn(x)", "-x", "step(x)", + "tanh(x)", + "elu(x)", "relu(x)", "gelu(x)", "gelu_quick(x)", @@ -3822,9 +3869,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d_s1_ph(x)", - "conv_1d_s2_ph(x)", - "conv_2d_sk_p0(x)", + "conv_1d(x)", + "conv_2d(x)", + "pool_1d(x)", + "pool_2d(x)", "flash_attn(x)", "flash_ff(x)", @@ -3843,11 +3891,47 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); + +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// WARN: +// Mis-configuration can lead to problems that are hard to reason about: +// * At best it crashes or talks nonsense. +// * At worst the output is subtly off in ways that are hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of its branches needs that pass. +// Take care with compile options (e.g., GGML_USE_xxx).
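// A simplified sketch of the dispatch these tables are presumably consulted by (hypothetical,
// condensed from the GGML_TASK_* handling visible in the compute functions later in this diff):
//
//     if (GGML_OP_HAS_INIT[node->op]) { params.type = GGML_TASK_INIT; ggml_compute_forward(&params, node); }
//     params.type = GGML_TASK_COMPUTE; ggml_compute_forward(&params, node);
//     if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; ggml_compute_forward(&params, node); }
//
// An op whose flag is left false here simply never gets the extra pass.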
+static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; + +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + // // ggml context // @@ -4088,10 +4172,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { @@ -4271,6 +4354,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_vk_init(); #endif + ggml_setup_op_has_task_pass(); + is_first_call = false; } @@ -4331,8 +4416,8 @@ void ggml_free(struct ggml_context * ctx) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { GGML_ALIGNED_FREE(ctx->mem_buffer); @@ -4511,17 +4596,14 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.op =*/ GGML_OP_NONE, /*.is_param =*/ false, /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.src =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -4653,7 +4735,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4705,7 +4787,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4940,8 +5022,8 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -4965,11 +5047,15 @@ struct ggml_tensor * ggml_add_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; - if (a->grad || b->grad) { + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } @@ -4977,8 +5063,8 @@ struct ggml_tensor * ggml_add_impl( result->op = GGML_OP_ADD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5017,8 +5103,8 @@ struct ggml_tensor * ggml_add1_impl( result->op = GGML_OP_ADD1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5075,9 +5161,9 @@ struct ggml_tensor * ggml_acc_impl( result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -5123,8 +5209,8 @@ struct ggml_tensor * ggml_sub_impl( result->op = GGML_OP_SUB; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5170,8 +5256,8 @@ struct ggml_tensor * ggml_mul_impl( result->op = GGML_OP_MUL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5213,8 +5299,8 @@ struct ggml_tensor * ggml_div_impl( result->op = GGML_OP_DIV; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5249,8 +5335,8 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5283,8 +5369,8 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5318,8 +5404,8 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5351,8 +5437,8 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5378,8 +5464,8 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5401,8 +5487,32 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +// ggml_argmax + +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5429,8 +5539,8 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5457,8 +5567,8 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5479,8 +5589,8 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5514,8 +5624,8 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5548,8 +5658,8 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5582,8 +5692,8 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5600,6 +5710,74 @@ struct ggml_tensor * ggml_step_inplace( return ggml_step_impl(ctx, a, true); } +// ggml_tanh + +struct ggml_tensor * ggml_tanh_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_TANH; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, true); +} + +// ggml_elu + +struct ggml_tensor * ggml_elu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ELU; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, true); +} + // ggml_relu struct ggml_tensor * ggml_relu_impl( @@ -5616,8 +5794,8 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5650,8 +5828,8 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5684,8 +5862,8 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5718,8 +5896,8 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5753,8 +5931,8 @@ struct ggml_tensor * ggml_silu_back( result->op = GGML_OP_SILU_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5776,8 +5954,8 @@ struct ggml_tensor * ggml_norm_impl( result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5808,8 +5986,8 @@ struct ggml_tensor * ggml_rms_norm_impl( result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5841,8 +6019,8 @@ struct ggml_tensor * ggml_rms_norm_back( result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5863,13 +6041,13 @@ struct ggml_tensor * ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5894,8 +6072,8 @@ struct ggml_tensor * ggml_out_prod( result->op = GGML_OP_OUT_PROD; result->grad = is_node ? 
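/*
 * With the relaxed ggml_can_mul_mat/ggml_mul_mat above, src0 can now be broadcast across src1's
 * outer dimensions. Illustrative shapes (chosen for this note, not taken from the diff):
 * a->ne = { K, M, 1, 1 } against b->ne = { K, N, 8, 1 } passes the modulo checks (8 % 1 == 0)
 * and produces result->ne = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] } = { M, N, 8, 1 },
 * reusing the single a matrix for each of b's eight slices.
 */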
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5920,8 +6098,8 @@ struct ggml_tensor * ggml_scale_impl( result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5976,9 +6154,9 @@ struct ggml_tensor * ggml_set_impl( result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6065,8 +6243,8 @@ struct ggml_tensor * ggml_cpy_impl( result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6102,8 +6280,8 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6146,8 +6324,8 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6171,8 +6349,8 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6197,8 +6375,8 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6224,8 +6402,8 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6253,8 +6431,8 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6286,9 +6464,9 @@ struct ggml_tensor * ggml_view_1d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6328,9 +6506,9 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6372,9 +6550,9 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6418,9 +6596,9 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6480,8 +6658,8 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; if (is_node) { ggml_scratch_save(ctx); @@ -6495,7 +6673,7 @@ struct ggml_tensor * ggml_permute( ggml_scratch_load(ctx); - result->opt[0] = b; + result->src[2] = b; } return result; @@ -6523,8 +6701,8 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6549,8 +6727,8 @@ struct ggml_tensor * ggml_get_rows( result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6577,9 +6755,9 @@ struct ggml_tensor * ggml_get_rows_back( result->op = GGML_OP_GET_ROWS_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6601,8 +6779,8 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6634,8 +6812,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6682,8 +6860,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6718,8 +6896,8 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6754,8 +6932,8 @@ struct ggml_tensor * ggml_soft_max_back_impl( result->op = GGML_OP_SOFT_MAX_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6782,6 +6960,8 @@ struct ggml_tensor * ggml_rope_impl( int n_past, int n_dims, int mode, + float freq_base, + float freq_scale, int n_ctx, bool inplace) { GGML_ASSERT(n_past >= 0); @@ -6795,19 +6975,21 @@ struct ggml_tensor * ggml_rope_impl( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[3] = n_ctx; + memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); + memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); ggml_scratch_load(ctx); result->op = GGML_OP_ROPE; result->grad = is_node ? 
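/*
 * The two new floats ride after the four int32 params packed above; a consumer would presumably
 * unpack them symmetrically (a sketch that simply mirrors the packing, not code from this diff):
 *
 *     const int n_past = ((int32_t *) src1->data)[0];
 *     const int n_dims = ((int32_t *) src1->data)[1];
 *     const int mode   = ((int32_t *) src1->data)[2];
 *     const int n_ctx  = ((int32_t *) src1->data)[3];
 *     float freq_base, freq_scale;
 *     memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
 *     memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 */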
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6819,7 +7001,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6829,7 +7011,19 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true); } // ggml_rope_back @@ -6841,6 +7035,8 @@ struct ggml_tensor * ggml_rope_back( int n_dims, int mode) { GGML_ASSERT(n_past >= 0); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); + bool is_node = false; if (a->grad) { @@ -6862,8 +7058,8 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6901,8 +7097,8 @@ struct ggml_tensor * ggml_alibi( result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6935,21 +7131,27 @@ struct ggml_tensor * ggml_clamp( result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } -// ggml_conv_1d_s1_ph +// ggml_conv_1d -struct ggml_tensor * ggml_conv_1d_s1_ph( +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + int s0, + int p0, + int d0) { GGML_ASSERT(ggml_is_matrix(b)); GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); bool is_node = false; if (a->grad || b->grad) { @@ -6957,54 +7159,42 @@ struct ggml_tensor * ggml_conv_1d_s1_ph( is_node = true; } - const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + a->ne[2], 1, 1, + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_S1_PH; + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = p0; + ((int32_t*)c->data)[2] = d0; + ggml_scratch_load(ctx); + + result->op = GGML_OP_CONV_1D; result->grad = is_node ? 
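/*
 * A worked instance of ggml_calc_conv_output_size above (numbers are illustrative):
 * ins = 10, ks = 3, s0 = 1, p0 = 1, d0 = 1 gives (10 + 2*1 - 1*(3-1) - 1)/1 + 1 = 10, i.e. a
 * "same"-length convolution — the case ggml_conv_1d_ph further below arranges via p0 = a->ne[0]/2.
 * With s0 = 2, p0 = 0 the same input yields (10 - 2 - 1)/2 + 1 = 4 samples.
 */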
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } -// ggml_conv_1d_s2_ph +// ggml_conv_2d -struct ggml_tensor * ggml_conv_1d_s2_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; +struct ggml_tensor* ggml_conv_2d( + struct ggml_context* ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - - result->op = GGML_OP_CONV_1D_S2_PH; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_conv_2d_sk_p0 - -struct ggml_tensor * ggml_conv_2d_sk_p0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(b->ne[3] == 1); GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(b->ne[0] % a->ne[0] == 0); - GGML_ASSERT(b->ne[1] % a->ne[1] == 0); bool is_node = false; if (a->grad || b->grad) { @@ -7012,13 +7202,132 @@ struct ggml_tensor * ggml_conv_2d_sk_p0( is_node = true; } - const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), + a->ne[3], b->ne[3], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - result->op = GGML_OP_CONV_2D_SK_P0; + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = s1; + ((int32_t*)c->data)[2] = p0; + ((int32_t*)c->data)[3] = p1; + ((int32_t*)c->data)[4] = d0; + ((int32_t*)c->data)[5] = d1; + ggml_scratch_load(ctx); + + result->op = GGML_OP_CONV_2D; result->grad = is_node ? 
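/*
 * For the 2-D case the same size formula is applied once per axis, so an illustrative 64x48
 * input b with a 3x3 kernel a, s0 = s1 = 1, p0 = p1 = 1, d0 = d1 = 1 keeps its spatial extent:
 * ne = { (64+2-2-1)/1+1, (48+2-2-1)/1+1, a->ne[3], b->ne[3] } = { 64, 48, a->ne[3], b->ne[3] }.
 */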
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; + +} + +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + + +// ggml_pool_* + +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_pool_1d + +struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = s0; + ((int32_t*)c->data)[3] = p0; + ggml_scratch_load(ctx); + + result->op = GGML_OP_POOL_1D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = c; + + return result; +} + +// ggml_pool_2d + +struct ggml_tensor* ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = k1; + ((int32_t*)c->data)[3] = s0; + ((int32_t*)c->data)[4] = s1; + ((int32_t*)c->data)[5] = p0; + ((int32_t*)c->data)[6] = p1; + ggml_scratch_load(ctx); + + result->op = GGML_OP_POOL_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = c; return result; } @@ -7045,10 +7354,10 @@ struct ggml_tensor * ggml_flash_attn( result->op = GGML_OP_FLASH_ATTN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7076,11 +7385,11 @@ struct ggml_tensor * ggml_flash_ff( result->op = GGML_OP_FLASH_FF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; return result; } @@ -7140,11 +7449,11 @@ struct ggml_tensor * ggml_flash_attn_back( result->op = GGML_OP_FLASH_ATTN_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = d; - result->opt[2] = ggml_new_i32(ctx, masked ?
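/*
 * A worked instance of ggml_calc_pool_output_size from the pooling ops above (illustrative
 * numbers): k0 = 2, s0 = 2, p0 = 0 halves an axis of length 10, since (10 + 0 - 2)/2 + 1 = 5 —
 * the usual 2x down-sampling for either pooling mode (presumably GGML_OP_POOL_MAX and
 * GGML_OP_POOL_AVG, the two values behind the GGML_OP_POOL_COUNT == 2 assert earlier).
 */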
1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7189,9 +7498,9 @@ struct ggml_tensor * ggml_win_part( result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7226,9 +7535,9 @@ struct ggml_tensor * ggml_win_unpart( result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7257,8 +7566,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32( result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7304,9 +7613,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32( result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7351,8 +7660,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7396,9 +7705,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7445,10 +7754,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - result->opt[1] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; + result->src[3] = c; return result; } @@ -7488,8 +7797,8 @@ struct ggml_tensor * ggml_cross_entropy_loss( result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7508,9 +7817,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->grad = NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -7570,25 +7879,7 @@ static void ggml_compute_forward_dup_f16( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7661,8 +7952,8 @@ static void ggml_compute_forward_dup_f16( id += ne00 * (ne01 - ir1); } } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; @@ -7859,25 +8150,7 @@ static void ggml_compute_forward_dup_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7932,26 +8205,8 @@ static void ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); @@ -8165,7 +8420,7 @@ static void ggml_compute_forward_add_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + 
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8175,24 +8430,8 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8206,23 +8445,23 @@ static void ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); #ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_add_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -8230,15 +8469,20 @@ static void ggml_compute_forward_add_f32( } else { // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + 
i0*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -8261,28 +8505,12 @@ static void ggml_compute_forward_add_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8331,24 +8559,8 @@ static void ggml_compute_forward_add_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -8398,32 +8610,15 @@ static void ggml_compute_forward_add_q_f32( } const int nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8537,19 +8732,8 @@ static void ggml_compute_forward_add1_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); 
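The broadcast row mapping that ggml_compute_forward_add_f32 now uses (i13 = i03 % ne13 and friends) is easy to check in isolation. A minimal standalone sketch with illustrative shapes (not ggml code; the loop only reproduces the index arithmetic from the hunk above):

#include <stdio.h>

int main(void) {
    // src0 (and dst) rows: ne01 = 4, ne02 = 2, ne03 = 1; src1 rows: ne11 = 1, ne12 = 2, ne13 = 1.
    // Every src0 row of a given i02 slice maps onto src1's single row for that slice.
    const int ne01 = 4, ne02 = 2, ne03 = 1;
    const int ne11 = 1, ne12 = 2, ne13 = 1;
    const int nr = ne01*ne02*ne03; // total rows, as computed by ggml_nrows(src0)
    for (int ir = 0; ir < nr; ++ir) {
        const int i03 = ir/(ne02*ne01);
        const int i02 = (ir - i03*ne02*ne01)/ne01;
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
        printf("dst row (%d,%d,%d) += src1 row (%d,%d,%d)\n",
               i01, i02, i03, i01 % ne11, i02 % ne12, i03 % ne13);
    }
    return 0;
}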
@@ -8603,23 +8787,12 @@ static void ggml_compute_forward_add1_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8664,23 +8837,12 @@ static void ggml_compute_forward_add1_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8725,23 +8887,12 @@ static void ggml_compute_forward_add1_q_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8869,15 +9020,8 @@ static void ggml_compute_forward_acc_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during acc const size_t nb0 = ggml_element_size(src0); @@ -8966,24 +9110,8 @@ static void ggml_compute_forward_sub_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = 
src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9073,29 +9201,7 @@ static void ggml_compute_forward_mul_f32( const int64_t nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9183,24 +9289,8 @@ static void ggml_compute_forward_div_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9407,14 +9497,8 @@ static void ggml_compute_forward_sum_f32( assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); ggml_float sum = 0; ggml_float row_sum = 0; @@ -9463,29 +9547,13 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne1 == ne01); GGML_ASSERT(ne2 == ne02); GGML_ASSERT(ne3 == ne03); - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { @@ -9529,19 +9597,7 @@ static void ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - 
const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; assert(ne0 == 1); assert(ne1 == ne01); @@ -9553,10 +9609,6 @@ static void ggml_compute_forward_mean_f32( UNUSED(ne2); UNUSED(ne3); - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { @@ -9586,6 +9638,52 @@ static void ggml_compute_forward_mean( } } +// ggml_compute_forward_argmax + +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_repeat static void ggml_compute_forward_repeat_f32( @@ -9599,25 +9697,7 @@ static void ggml_compute_forward_repeat_f32( return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -9678,25 +9758,7 @@ static void ggml_compute_forward_repeat_back_f32( return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -9926,6 +9988,90 @@ static void ggml_compute_forward_step( } } +// ggml_compute_forward_tanh + 
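The two new element-wise ops below (tanh, then ELU) delegate the per-row math to one-row vector helpers. A minimal sketch of those helpers, assuming they follow the same inline pattern as the rest of the ggml_vec_* family (their actual definitions are added in the vector-op section of this patch and may differ in detail):

#include <math.h>

// y[i] = tanh(x[i]) over one row of n floats
inline static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = tanhf(x[i]);
    }
}

// ELU with alpha = 1: identity for positive inputs, exp(x) - 1 otherwise
// (expm1f is the numerically safer spelling of expf(x) - 1.0f)
inline static void ggml_vec_elu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.0f) ? x[i] : expm1f(x[i]);
    }
}

Each forward function then just walks ggml_nrows(src0) rows and applies its helper with the row strides, matching the existing relu implementation.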
+static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_relu static void ggml_compute_forward_relu_f32( @@ -10227,18 +10373,7 @@ static void ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-5f; // TODO: make this a parameter @@ -10304,18 +10439,7 @@ static void ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10380,22 +10504,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t 
nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10552,7 +10661,6 @@ static void ggml_compute_forward_rms_norm_back( } } - // ggml_compute_forward_mul_mat #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) @@ -10583,7 +10691,7 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -static void ggml_compute_forward_mul_mat_f32( +static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10591,436 +10699,26 @@ static void ggml_compute_forward_mul_mat_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif - const int64_t ne11 = src1->ne[1]; -#ifndef NDEBUG - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; -#endif - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - -#ifndef NDEBUG - const int nb10 = src1->nb[0]; -#endif - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - assert(ne02 == ne12); - assert(ne03 == ne13); - assert(ne2 == ne12); - assert(ne3 == ne13); + const enum ggml_type type = src0->type; - // we don't support permuted src0 or src1 - assert(nb00 == sizeof(float)); - assert(nb10 == sizeof(float)); + const bool src1_cont = ggml_is_contiguous(src1); - // dst cannot be transposed or permuted - assert(nb0 == sizeof(float)); - assert(nb0 <= nb1); - assert(nb1 <= nb2); - assert(nb2 <= nb3); - - assert(ne0 == ne01); - assert(ne1 == ne11); - assert(ne2 == ne02); - assert(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#elif defined(GGML_USE_VULKAN) - if (ggml_vk_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_vk_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - 
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using ggml_vec_dot_f32 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - for (int64_t ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; - - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; - - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - - //int64_t t1 = ggml_perf_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum 
ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#elif defined(GGML_USE_VULKAN) - if (ggml_vk_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_vk_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - ggml_fp16_t * const wdata = params->wdata; - - size_t id = 0; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } - } - } - } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); - - // parallelize by src0 rows using ggml_vec_dot_f16 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - 
for (int64_t ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } - } - - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; - enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type; + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -11029,16 +10727,16 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } @@ -11055,6 +10753,11 @@ static void ggml_compute_forward_mul_mat_q_f32( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith != 0) 
{ return; } @@ -11067,27 +10770,27 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i03*nb03 + i02*nb02; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); + to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } assert(id*sizeof(float) <= params->wsize); + x = wdata; } - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -11103,14 +10806,16 @@ static void ggml_compute_forward_mul_mat_q_f32( #endif if (params->type == GGML_TASK_INIT) { - char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } } } } @@ -11122,43 +10827,52 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - // parallelize by src0 rows using ggml_vec_dot_q + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1)/nth; - // total rows in src0 - const int nr = ne01*ne02*ne03; + const int64_t ir10 = dr*ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); - // rows per thread - const int dr = (nr + nth - 1)/nth; + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t ir0 = (ir1/ne11)%(ne02*ne03); + const int64_t i03 = (ir0/(ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. 
+ // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + // GG: this is likely the correct way to broadcast, though need some more thought + // therefore leaving the comments to remind us for now + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); - const int i13 = i03; - const int i12 = i02; + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size)); + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); + for (int64_t ir = ir10; ir < ir11; ++ir) { + vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); } } @@ -11175,40 +10889,6 @@ static void ggml_compute_forward_mul_mat_q_f32( //} } -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - { - ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_out_prod @@ -11221,35 +10901,7 @@ static void ggml_compute_forward_out_prod_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - //const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = 
src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -11484,15 +11136,8 @@ static void ggml_compute_forward_set_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during set const size_t nb0 = ggml_element_size(src0); @@ -11633,7 +11278,7 @@ static void ggml_compute_forward_get_rows_q( const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); @@ -11883,29 +11528,14 @@ static void ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted matrices - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - const int ne03 = src0->ne[3]; - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; + GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne1); GGML_ASSERT(ne01 == 1); GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne3); - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float)); @@ -12239,7 +11869,7 @@ static void ggml_compute_forward_alibi_f32( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -12250,8 +11880,9 @@ static void ggml_compute_forward_alibi_f32( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12275,7 +11906,7 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (i-ne0+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } @@ -12304,7 +11935,7 @@ static void ggml_compute_forward_alibi_f16( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -12315,8 +11946,9 @@ static void ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(ggml_fp16_t)); - assert(ne1 + n_past 
== ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12341,7 +11973,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } } @@ -12469,33 +12101,25 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12520,7 +12144,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12532,7 +12156,7 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12609,33 +12233,25 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ 
-12660,7 +12276,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12672,7 +12288,7 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12733,7 +12349,7 @@ static void ggml_compute_forward_rope_f16( const float x0 = GGML_FP16_TO_FP32(src[0]); const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -12788,21 +12404,7 @@ static void ggml_compute_forward_rope_back_f32( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12901,21 +12503,7 @@ static void ggml_compute_forward_rope_back_f16( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13013,7 +12601,7 @@ static void ggml_compute_forward_rope_back( } } -// ggml_compute_forward_conv_1d_s1_ph +// ggml_compute_forward_conv_1d static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, @@ -13027,36 +12615,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13147,36 
+12706,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13276,8 +12806,6 @@ static void ggml_compute_forward_conv_1d_s1_ph( } } -// ggml_compute_forward_conv_1d_s2_ph - static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -13290,36 +12818,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13410,36 +12909,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13539,12 +13009,35 @@ static void 
ggml_compute_forward_conv_1d_s2_ph( } } -// ggml_compute_forward_conv_2d_sk_p0 +// ggml_compute_forward_conv_1d -static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t p0 = ((const int32_t*)(opt0->data))[1]; + const int32_t d0 = ((const int32_t*)(opt0->data))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported + if (s0 == 1) { + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); + } else if (s0 == 2) { + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + } else { + GGML_ASSERT(false); // only stride 1 and 2 supported + }; +} + +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13553,36 +13046,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - //const int ne03 = src0->ne[3]; - - const int ne10 = src1->ne[0]; - //const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - //const int ne13 = src1->ne[3]; - - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - //const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - //const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - //const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - //const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13593,11 +13057,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t s1 = ((const int32_t*)(opt0->data))[1]; + const int32_t p0 = ((const int32_t*)(opt0->data))[2]; + const int32_t p1 = ((const int32_t*)(opt0->data))[3]; + const int32_t d0 = ((const int32_t*)(opt0->data))[4]; + const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare source data (src1) @@ -13612,8 +13082,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( for (int i0 = 0; i0 < ne0; i0++) { for (int ik1 = 0; ik1 < nk1; ik1++) { for (int ik0 = 0; ik0 < nk0; ik0++) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + 
i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -13640,32 +13115,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2); + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0); + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); + } } } } } -static void ggml_compute_forward_conv_2d_sk_p0( +static void ggml_compute_forward_conv_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + const struct ggml_tensor * opt0, + struct ggml_tensor * dst + ) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); } break; case GGML_TYPE_F32: { - //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); GGML_ASSERT(false); } break; default: @@ -13675,6 +13154,167 @@ static void ggml_compute_forward_conv_2d_sk_p0( } } +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* opt0, + struct ggml_tensor* dst) { + GGML_ASSERT(opt0->ne[0] == 4); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k 
supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + + cdata += src->nb[2]; + dplane += pa; + } +} + +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(opt0->ne[0] == 7); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); +} + + // ggml_compute_forward_flash_attn static void ggml_compute_forward_flash_attn_f32( @@ -13687,45 +13327,14 @@ static void ggml_compute_forward_flash_attn_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - 
const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13896,45 +13505,14 @@ static void ggml_compute_forward_flash_attn_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14168,65 +13746,18 @@ static void ggml_compute_forward_flash_ff_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t nea0 = a->ne[0]; - const int64_t nea1 = a->ne[1]; - const int64_t nea2 = a->ne[2]; - const int64_t nea3 = a->ne[3]; - - const int64_t neb00 = b0->ne[0]; - const int64_t neb01 = b0->ne[1]; - //const int64_t neb02 = b0->ne[2]; - //const int64_t neb03 = b0->ne[3]; - - const int64_t neb10 = b1->ne[0]; - const int64_t neb11 = b1->ne[1]; - //const int64_t neb12 = b1->ne[2]; - //const int64_t neb13 = b1->ne[3]; - - const int64_t nec00 = c0->ne[0]; - const int64_t nec01 = c0->ne[1]; - //const int64_t nec02 = c0->ne[2]; - //const int64_t nec03 = c0->ne[3]; - - const int64_t nec10 = c1->ne[0]; - const int64_t nec11 = c1->ne[1]; - //const int64_t nec12 = c1->ne[2]; - //const int64_t nec13 = c1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nba0 = a->nb[0]; - const int nba1 = a->nb[1]; - const int nba2 = a->nb[2]; - const int nba3 = a->nb[3]; - - const int nbb00 = b0->nb[0]; - const int nbb01 = b0->nb[1]; - const int nbb02 = b0->nb[2]; - const int nbb03 = b0->nb[3]; - - const int nbb10 = b1->nb[0]; - //const int nbb11 = b1->nb[1]; - //const int nbb12 = b1->nb[2]; - //const int nbb13 = b1->nb[3]; - - const int nbc00 = c0->nb[0]; - const int nbc01 = c0->nb[1]; - const int nbc02 = c0->nb[2]; - const 
int nbc03 = c0->nb[3]; - - const int nbc10 = c1->nb[0]; - //const int nbc11 = c1->nb[1]; - //const int nbc12 = c1->nb[2]; - //const int nbc13 = c1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, nea, a, ne); + GGML_TENSOR_LOCALS(size_t, nba, a, nb); + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14374,55 +13905,16 @@ static void ggml_compute_forward_flash_attn_back_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ned0 = d->ne[0]; - const int64_t ned1 = d->ne[1]; - //const int64_t ned2 = d->ne[2]; - //const int64_t ned3 = d->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nbd0 = d->nb[0]; - const int nbd1 = d->nb[1]; - const int nbd2 = d->nb[2]; - const int nbd3 = d->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ned, d, ne); + GGML_TENSOR_LOCALS(size_t, nbd, d, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14780,15 +14272,8 @@ static void ggml_compute_forward_win_part_f32( return; } - const int64_t ne00 = src0->ne[0]; UNUSED(ne00); - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; UNUSED(ne03); - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; UNUSED(ne3); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; @@ -14851,14 +14336,8 @@ static void ggml_compute_forward_win_unpart_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - 
//const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t w = ((const int32_t *)(opt0->data))[0]; @@ -15399,279 +14878,295 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_compute_forward_dup(params, tensor->src[0], tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_compute_forward_sqr(params, tensor->src[0], tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_compute_forward_log(params, tensor->src[0], tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_compute_forward_sum(params, tensor->src[0], tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_compute_forward_mean(params, tensor->src[0], tensor); + } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_compute_forward_repeat(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; case GGML_OP_ABS: { - ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_compute_forward_abs(params, tensor->src[0], tensor); } break; case GGML_OP_SGN: { - 
ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_compute_forward_sgn(params, tensor->src[0], tensor); } break; case GGML_OP_NEG: { - ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_compute_forward_neg(params, tensor->src[0], tensor); } break; case GGML_OP_STEP: { - ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_compute_forward_step(params, tensor->src[0], tensor); + } break; + case GGML_OP_TANH: + { + ggml_compute_forward_tanh(params, tensor->src[0], tensor); + } break; + case GGML_OP_ELU: + { + ggml_compute_forward_elu(params, tensor->src[0], tensor); } break; case GGML_OP_RELU: { - ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_compute_forward_relu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU: { - ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_compute_forward_gelu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor); } break; case GGML_OP_SILU: { - ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_compute_forward_silu(params, tensor->src[0], tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_compute_forward_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_compute_forward_cpy(params, tensor->src[0], tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_compute_forward_cont(params, tensor->src[0], tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src0, tensor); + ggml_compute_forward_reshape(params, tensor->src[0], tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src0); + ggml_compute_forward_view(params, tensor->src[0]); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src0); + ggml_compute_forward_permute(params, tensor->src[0]); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src0); + 
ggml_compute_forward_transpose(params, tensor->src[0]); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_compute_forward_diag(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case GGML_OP_CONV_2D_SK_P0: + case GGML_OP_POOL_1D: { - ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_POOL_2D: + { + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_FLASH_ATTN: { - const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + const int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], 
tensor->opt[2], tensor); + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { - int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + int32_t t = ggml_get_i32_1d(tensor->src[4], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_NONE: @@ -15688,8 +15183,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm 
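All of the dispatch rewrites above follow one mechanical rule: src0 becomes src[0], src1 becomes src[1], and the former opt[j] operands become src[2 + j], with GGML_MAX_OPT replaced by GGML_MAX_SRC == 6. A minimal sketch of the uniform traversal this unification buys, modeled on the ggml_visit_parents hunk later in this diff (the real function also deduplicates nodes before recursing, which is omitted here):

```c
#include "ggml.h"

// Simplified parent walk over the unified src[] array. Before this diff the
// same loop needed three special cases: src0, src1, and opt[0..GGML_MAX_OPT-1].
static void visit_parents(struct ggml_tensor * node) {
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (node->src[i]) {
            visit_parents(node->src[i]);
        }
    }
    // an operand that used to be tensor->opt[j] now lives in tensor->src[2 + j];
    // this is why GGML_OP_ACC above reads tensor->src[2] where it read tensor->opt[0]
}
```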
//////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; switch (tensor->op) { case GGML_OP_DUP: @@ -15725,12 +15220,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15855,6 +15350,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; case GGML_OP_MEAN: + case GGML_OP_ARGMAX: { GGML_ASSERT(false); // TODO: implement } break; @@ -15908,6 +15404,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; + case GGML_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RELU: { if (src0->grad) { @@ -15927,14 +15431,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_ALIBI: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CLAMP: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_SILU: { // necessary for llama @@ -16037,12 +15533,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -16119,8 +15615,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); - memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); + memcpy(&offset, tensor->src[2]->data, 
sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -16147,7 +15643,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->opt[0]->data; + int32_t * axes = (int32_t *) tensor->src[2]->data; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -16251,7 +15747,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(ggml_nelements(src1) == 6); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16272,7 +15768,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 3); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16291,30 +15787,42 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_ALIBI: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_CLAMP: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_2D_SK_P0: + case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_2D: { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_FLASH_ATTN: { struct ggml_tensor * flash_grad = NULL; - if (src0->grad || src1->grad || tensor->opt[0]->grad) { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, src0, src1, - tensor->opt[0], + tensor->src[2], tensor->grad, masked); } @@ -16411,7 +15919,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } - struct ggml_tensor * opt0 = tensor->opt[0]; + struct ggml_tensor * opt0 = tensor->src[2]; if (opt0->grad) { struct ggml_tensor * grad_v = NULL; @@ -16527,17 +16035,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->src0) { - ggml_visit_parents(cgraph, node->src0); - } - - if (node->src1) { - ggml_visit_parents(cgraph, node->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); } } @@ -16592,9 +16092,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -16765,16 +16262,20 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * 
cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; }; struct ggml_compute_state { @@ -16794,14 +16295,22 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16809,15 +16318,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = n_tasks_arr[node_n]; + ggml_compute_forward(¶ms, node); + } ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16826,27 +16337,38 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - /* INIT */ - params.type = GGML_TASK_INIT; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); + params.nth = n_tasks; - if (node->n_tasks == 1) { + /* INIT */ + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); + } + + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) 
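// when n_tasks == 1 the node never reaches the spin barrier below: this thread
// runs its COMPUTE pass here and its FINALIZE pass just after, so the other
// workers skip the node entirely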
params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } } atomic_store(&state->shared->n_active, n_threads); @@ -16855,7 +16377,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } @@ -16865,382 +16387,398 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } - return 0; + return GGML_EXIT_SUCCESS; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; + + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 
= ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src[0]->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); + } + } else +#endif + if (node->src[1]->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src[0]->ne[0]; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; + const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; + + UNUSED(ne03); + UNUSED(ne2); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == 
GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } + } + } + + const int 
n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - // initialize tasks + work buffer - { - size_t work_size = 0; - - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - -#if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else -#endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == 
GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D_S1_PH: - case GGML_OP_CONV_1D_S2_PH: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D_SK_P0: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src1->ne[3] == 1); - - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N - - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C - - const int64_t nk = ne00*ne01; - - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); - - size_t cur = 0; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == 
GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } - - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } - - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); - - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); - } - } - // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -17261,12 +16799,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join thread pool + // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); @@ -17290,6 +16828,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / 
cgraph->perf_runs); } + + return compute_status; } void ggml_graph_reset(struct ggml_cgraph * cgraph) { @@ -17302,6 +16842,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -17340,22 +16891,18 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - //assert(cgraph->work == NULL); - //assert(cgraph->work_size == 0); - uint64_t size_eval = 0; // compute size of intermediate results @@ -17384,8 +16931,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { ggml_graph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); } // header @@ -17396,17 +16943,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { for (int i = 0; i < cgraph->n_nodes; ++i) { ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); } } @@ -17460,13 +16999,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // dump the data @@ -17500,27 +17032,17 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // output the op arguments { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 
0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { if (args[j]) { int32_t idx = -1; @@ -17691,8 +17213,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->op = (enum ggml_op) op; - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; tensor->data = (void *) ptr; @@ -17738,16 +17258,14 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** nb[j] = nb_cur; } - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used - const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; // parse args - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { const int32_t arg_idx = ptr_arg_idx[j]; if (arg_idx == -1) { @@ -17804,11 +17322,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->nb[j] = nb[j]; } - tensor->src0 = args[0]; - tensor->src1 = args[1]; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - tensor->opt[j] = args[2 + j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; } result.nodes[i] = tensor; @@ -17826,9 +17341,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -18007,19 +17519,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - if (node->src0) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); } } } @@ -18027,19 +17531,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_leafs; i++) { struct ggml_tensor * node = gb->leafs[i]; - if (node->src0) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); } } } @@ -18101,9 +17597,6 @@ static 
enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -18150,7 +17643,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -18230,7 +17724,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -18352,7 +17847,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -18420,9 +17916,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -18474,7 +17967,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/ggml.h b/ggml.h index 459913222..24856a255 100644 --- a/ggml.h +++ b/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -132,10 +132,10 @@ // { // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); // -// // a[1, 2] = 1.0f; +// // a[2, 1] = 1.0f; // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; // -// // a[2, 0] = 2.0f; +// // a[0, 2] = 2.0f; // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; // // ... @@ -197,10 +197,17 @@ #define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +#define GGML_UNUSED(x) (void)(x) + + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ @@ -209,6 +216,30 @@ } \ } while (0) +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. 
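+// every local is wrapped in GGML_UNUSED() so kernels that read only some of them compile without unused-variable warnings.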
+// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + #ifdef __cplusplus extern "C" { #endif @@ -224,8 +255,8 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); - GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); - GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); struct ggml_object; struct ggml_context; @@ -295,12 +326,15 @@ extern "C" { GGML_OP_SUM, GGML_OP_SUM_ROWS, GGML_OP_MEAN, + GGML_OP_ARGMAX, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, GGML_OP_STEP, + GGML_OP_TANH, + GGML_OP_ELU, GGML_OP_RELU, GGML_OP_GELU, GGML_OP_GELU_QUICK, @@ -332,9 +366,10 @@ extern "C" { GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CLAMP, - GGML_OP_CONV_1D_S1_PH, - GGML_OP_CONV_1D_S2_PH, - GGML_OP_CONV_2D_SK_P0, + GGML_OP_CONV_1D, + GGML_OP_CONV_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, @@ -386,12 +421,7 @@ extern "C" { bool is_param; struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; - - // thread scheduling - int n_tasks; + struct ggml_tensor * src[GGML_MAX_SRC]; // performance int perf_runs; @@ -404,19 +434,31 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -444,6 +486,9 @@ extern "C" { // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
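For reference, the comment's own example GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) above expands, through the _1/_2/_3 helpers, to four locals with unused-variable warnings suppressed:

const int64_t ne10 = (src1)->ne[0]; (void)(ne10);
const int64_t ne11 = (src1)->ne[1]; (void)(ne11);
const int64_t ne12 = (src1)->ne[2]; (void)(ne12);
const int64_t ne13 = (src1)->ne[3]; (void)(ne13);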
enum ggml_task_type { GGML_TASK_INIT = 0, GGML_TASK_COMPUTE, @@ -687,6 +732,11 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b GGML_API struct ggml_tensor * ggml_repeat( @@ -731,6 +781,22 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_relu( struct ggml_context * ctx, struct ggml_tensor * a); @@ -1055,6 +1121,17 @@ extern "C" { int mode, int n_ctx); + // custom RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( @@ -1081,58 +1158,58 @@ extern "C" { float min, float max); - // TODO: implement general-purpose convolutions - // GGML_API struct ggml_tensor * ggml_conv_1d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0 - // int p0, - // int d0); - // - // GGML_API struct ggml_tensor * ggml_conv_2d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0, - // int s1, - // int p0, - // int p1, - // int d0, - // int d1); - - // padding = half - // TODO: we don't support extra parameters for now - // that's why we are hard-coding the stride, padding, and dilation - // not great .. 
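ggml_rope_custom_inplace, declared above, exposes the base frequency and scale that plain ggml_rope hard-codes at 10000.0f and 1.0f. A hedged sketch of linear position scaling for an extended context; k_view and the 0.5f scale are illustrative assumptions, not values from this patch:

// k_view is assumed to be the usual n_embd/n_head x n_head x N reshape
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(
    ctx0, k_view, n_past, n_rot, /*mode*/ 0,
    /*freq_base*/ 10000.0f, /*freq_scale*/ 0.5f, /*n_ctx*/ 0);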
- // example: - // a: 3 80 768 1 - // b: 3000 80 1 1 - // res: 3000 768 1 1 - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph( + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph( + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); - // kernel size is a->ne[0] x a->ne[1] - // stride is equal to kernel size - // padding is zero - // example: - // a: 16 16 3 768 - // b: 1024 1024 3 1 - // res: 64 64 768 1 - // used in sam - GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s, + int d); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor* ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, @@ -1263,15 +1340,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); @@ -1488,25 +1572,24 @@ extern "C" { // #ifdef __cplusplus - // restrict not standard in C++ +// restrict not standard in C++ #define GGML_RESTRICT #else #define GGML_RESTRICT restrict #endif - typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); - typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void 
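The graph-plan declarations above are the core of this change: ggml_graph_compute() no longer takes a context and no longer allocates scratch memory behind the caller's back. A minimal sketch of the new sequence, assuming a graph gf built with ggml_build_forward(); the buffer handling is illustrative:

struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ GGML_DEFAULT_N_THREADS);

uint8_t * work = NULL;
if (plan.work_size > 0) {
    work = (uint8_t *) malloc(plan.work_size);  // caller-owned work buffer
    plan.work_data = work;
}

const int rc = ggml_graph_compute(&gf, &plan);  // GGML_EXIT_SUCCESS, or GGML_EXIT_ABORTED
                                                // when plan.abort_callback returns true
free(work);

Per its header comment, ggml_graph_compute_with_ctx() is the same flow with work_data allocated as part of the context.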
(*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; - quantize_row_q_t quantize_row_q_dot; - vec_dot_q_t vec_dot_q; - enum ggml_type vec_dot_type; - } quantize_fns_t; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; - quantize_fns_t ggml_internal_get_quantize_fn(size_t i); + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); #ifdef __cplusplus } diff --git a/k_quants.h b/k_quants.h index 6abe3d7b8..adc6a3913 100644 --- a/k_quants.h +++ b/k_quants.h @@ -15,6 +15,14 @@ #define K_SCALE_SIZE 12 #endif +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + // // Super-block quantization structures // diff --git a/llama.cpp b/llama.cpp index 049f73e44..0f9d5346d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,9 @@ #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif +#ifdef GGML_USE_MPI +#include "ggml-mpi.h" +#endif #ifdef GGML_USE_K_QUANTS #ifndef QK_K #ifdef GGML_QKK_64 @@ -66,6 +69,7 @@ enum e_model { MODEL_65B, }; +static const size_t kB = 1024; static const size_t MB = 1024*1024; // computed for n_ctx == 2048 @@ -78,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0() +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + +static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx) { static std::map<e_model, size_t> k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + /* empirical scaling, still a guess */ + { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB }, }; return k_sizes; } @@ -117,14 +141,42 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map<e_model, size_t> & MEM_REQ_EVAL() +static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx) { static std::map<e_model, size_t> k_sizes = { - { MODEL_3B, 512ull * MB }, - { MODEL_7B, 768ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB }, + {
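The renamed table is used the same way as the old quantize_fns_t, just with type-neutral names. A small round-trip sketch under the new API; the row length of 64 (a multiple of the Q4_0 block size) and the 2x buffer headroom, mirroring the tests further below, are illustrative:

ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);

float   src[64], out[64];
uint8_t q[2*64];

for (int i = 0; i < 64; ++i) src[i] = i / 64.0f;

traits.from_float(src, q, 64);   // was quantize_row_q
traits.to_float  (q, out, 64);   // was dequantize_row_q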
MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB }, + }; + return k_sizes; +} + +// amount of VRAM needed per batch size to hold temporary results +// the values for 3b and 65b are not derived from testing but instead chosen conservatively +static const std::map & VRAM_REQ_SCRATCH_BASE() +{ + static std::map k_sizes = { + { MODEL_3B, 512ull * kB }, + { MODEL_7B, 512ull * kB }, + { MODEL_13B, 640ull * kB }, + { MODEL_30B, 768ull * kB }, + { MODEL_65B, 1536ull * kB }, + }; + return k_sizes; +} + +// amount of VRAM needed per batch size and context to hold temporary results +// the values for 3b and 65b are not derived from testing but instead chosen conservatively +static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() +{ + static std::map k_sizes = { + { MODEL_3B, 128ull }, + { MODEL_7B, 128ull }, + { MODEL_13B, 160ull }, + { MODEL_30B, 208ull }, + { MODEL_65B, 416ull }, }; return k_sizes; } @@ -138,6 +190,10 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { @@ -165,8 +221,8 @@ struct llama_layer { }; struct llama_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; struct ggml_context * ctx = NULL; @@ -252,8 +308,14 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} - + llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} +#ifdef GGML_USE_METAL + ~llama_context() { + if (ctx_metal) { + ggml_metal_free(ctx_metal); + } + } +#endif std::mt19937 rng; bool has_evaluated_once = false; @@ -267,7 +329,6 @@ struct llama_context { int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) const llama_model & model; - const llama_vocab & vocab; bool model_owner = false; @@ -286,6 +347,9 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -295,6 +359,10 @@ struct llama_context { ggml_metal_context * ctx_metal = NULL; #endif +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif + int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -446,9 +514,7 @@ struct llama_file_loader { std::string word = file.read_string(len); float score = 0.0f; - if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { - file.read_raw(&score, sizeof(score)); - } + file.read_raw(&score, sizeof(score)); vocab.token_to_id[word] = i; @@ -586,7 +652,7 @@ struct llama_model_loader { *ctx_size_p = *mmapped_size_p = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + *(use_mmap ? 
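With the tables above, the host-memory estimates now grow linearly in n_ctx (still "a guess", per the code comment). Worked numbers for a 7B model at n_ctx = 2048:

// MEM_REQ_SCRATCH0: (2048/16  + 256) MB = 384 MB
// MEM_REQ_EVAL:     (2048/256 + 768) MB = 776 MB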
mmapped_size_p : ctx_size_p) += lt.size + 16; } } @@ -725,7 +791,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -783,6 +848,8 @@ struct llama_context_params llama_context_default_params() { /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -816,7 +883,7 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } -void llama_init_backend(bool numa) { +void llama_backend_init(bool numa) { ggml_time_init(); // needed to initialize f16 tables @@ -829,6 +896,16 @@ void llama_init_backend(bool numa) { if (numa) { ggml_numa_init(); } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} + +void llama_backend_free() { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif } int64_t llama_time_us() { @@ -896,6 +973,8 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -930,22 +1009,27 @@ static void llama_model_load_internal( } hparams.n_ctx = n_ctx; + + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; } const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1094,9 +1178,9 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size - vram_weights + // weights in VRAM not in memory - MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at (model.type); + MEM_REQ_EVAL(hparams.n_ctx).at(model.type); // this is the memory required by one llama_state const size_t 
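The backend entry points above become a symmetric pair: llama_init_backend() is renamed llama_backend_init(), and the new llama_backend_free() currently only tears down MPI. A sketch of the resulting program lifecycle (the middle section is elided):

int main(void) {
    llama_backend_init(/*numa*/ false);

    // ... load the model, create contexts, evaluate, sample ...

    llama_backend_free();  // a no-op unless built with GGML_USE_MPI
    return 0;
}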
mem_required_state = @@ -1112,14 +1196,18 @@ static void llama_model_load_internal( fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); ggml_cuda_set_scratch_size(0); // disable scratch } else { - vram_scratch = n_batch * MB; + const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); + const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type); + vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n", - __func__, vram_scratch / MB); + fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", + __func__, vram_scratch_base / kB, vram_scratch_per_context, + (vram_scratch + MB - 1) / MB); // round up } } #endif // GGML_USE_CUBLAS + #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -1128,6 +1216,10 @@ static void llama_model_load_internal( fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__); } size_t vram_kv_cache = 0; + +#ifdef GGML_USE_CUBLAS + const int max_backend_supported_layers = hparams.n_layer + 3; + const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; if (n_gpu_layers > (int) hparams.n_layer + 1) { if (low_vram) { fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); @@ -1144,14 +1236,18 @@ static void llama_model_load_internal( vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; } } - const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; +#elif defined(GGML_USE_CLBLAST) + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#endif // GGML_USE_CUBLAS + fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n", - __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3); + __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up #else (void) n_gpu_layers; -#endif +#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } // populate `tensors_by_name` @@ -1188,6 +1284,8 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1196,7 +1294,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1218,18 +1316,16 @@ static bool llama_eval_internal( llama_context & lctx, const llama_token * tokens, const float * embd, - const int n_tokens, - const int n_past, - const int n_threads, + int n_tokens, + int n_past, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && 
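The flat n_batch x 1 MB VRAM scratch guess above is replaced by a per-model linear formula in n_ctx. Plugging the 7B row (512 kB base, 128 B per context) into vram_scratch = n_batch * (base + n_ctx * per_context) with n_batch = 512 and n_ctx = 2048:

// vram_scratch = 512 * (512*1024 + 2048*128) B
//              = 512 * 786432 B
//              = 402653184 B = 384 MB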
!embd)); - // enforce that the first token is BOS - if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif const int64_t t_start_us = ggml_time_us(); @@ -1250,6 +1346,9 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; const int n_gpu_layers = model.n_gpu_layers; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1261,20 +1360,26 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; if (tokens) { - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } @@ -1292,18 +1397,20 @@ static bool llama_eval_internal( offload_func_t offload_func_v = llama_nop; #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; - } + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers; + } #endif // GGML_USE_CUBLAS for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); + offload_func_t offload_func = llama_nop; #ifdef GGML_USE_CUBLAS @@ -1339,11 +1446,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -1510,7 
+1617,6 @@ static bool llama_eval_internal( // input for next layer inpL = cur; - } lctx.use_buf(ctx0, 0); @@ -1518,7 +1624,6 @@ static bool llama_eval_internal( // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; - // norm { cur = ggml_rms_norm(ctx0, inpL); @@ -1533,7 +1638,6 @@ static bool llama_eval_internal( embeddings = cur; } - // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); @@ -1546,8 +1650,13 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); +#if GGML_USE_MPI + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer); +#endif + #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { @@ -1567,12 +1676,21 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } #else - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); #endif +#if GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer); +#endif + + // update kv token count + lctx.kv_self.n = n_past + N; + + struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1]; + if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); } @@ -1588,23 +1706,17 @@ static bool llama_eval_internal( // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); //} - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); - - // update kv token count - lctx.kv_self.n = n_past + N; - // extract logits { auto & logits_out = lctx.logits; if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -1860,10 +1972,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can return; } - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(ctx, candidates); + const int64_t t_start_sample_us = ggml_time_us(); + // Compute the cumulative probabilities float cum_sum = 0.0f; size_t last_idx = candidates->size; @@ -1892,9 +2004,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * return; } - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_time_us(); // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); @@ -1946,11 +2057,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c return; } - const int64_t t_start_sample_us = ggml_time_us(); - // Compute the softmax of logits and calculate entropy llama_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_time_us(); + float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { entropy += -candidates->data[i].p * logf(candidates->data[i].p); @@ -2074,6 +2185,62 @@ void llama_sample_frequency_and_presence_penalties(struct 
llama_context * ctx, l } } +static void llama_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} + +void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor) { + int64_t t_start_sample_us = ggml_time_us(); + + assert(ctx); + auto n_vocab = llama_n_vocab(ctx); + assert(n_vocab == (int)candidates->size); + assert(!candidates->sorted); + + std::vector logits_base; + logits_base.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + logits_base.push_back(candidates->data[i].logit); + } + llama_log_softmax(logits_base.data(), candidates->size); + + float* logits_guidance = llama_get_logits(guidance_ctx); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_guidance = logits_guidance[i]; + float logit_base = logits_base[i]; + logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance; + } + + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_base = logits_base[i]; + float logit_guidance = logits_guidance[i]; + + candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); @@ -2119,13 +2286,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; } return X; } llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { - assert(ctx); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); @@ -2140,13 +2305,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok candidates->size = 1; } + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + // Normalize the probabilities of the remaining words llama_sample_softmax(ctx, candidates); // Sample the next word X from the remaining words - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } llama_token X = llama_sample_token(ctx, candidates); t_start_sample_us = ggml_time_us(); @@ -2214,10 +2380,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam } float * f32_output = (float *) output.addr; - quantize_fns_t qtype; + ggml_type_traits_t qtype; if (ggml_is_quantized(tensor.type)) { - qtype = ggml_internal_get_quantize_fn(tensor.type); - if (qtype.dequantize_row_q == NULL) { + qtype = ggml_internal_get_type_traits(tensor.type); + if (qtype.to_float == NULL) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); } } else if (tensor.type != GGML_TYPE_F16) { @@ -2228,7 +2394,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam if (tensor.type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); } else 
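In log-probability terms, llama_sample_classifier_free_guidance above computes the following, with s = scale, a = smooth_factor, and both logit vectors passed through llama_log_softmax first:

// g_i   = log p_cfg(i) + s * (log p_base(i) - log p_cfg(i))   // s = 1 reproduces p_base
// out_i = a * log_softmax(g)_i + (1 - a) * log p_base(i)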
if (ggml_is_quantized(tensor.type)) { - qtype.dequantize_row_q(tensor.data, f32_output, nelements); + qtype.to_float(tensor.data, f32_output, nelements); } else { LLAMA_ASSERT(false); // unreachable } @@ -2253,7 +2419,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam if (typ == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); } else { - qtype.dequantize_row_q(inbuf, outbuf, nels); + qtype.to_float(inbuf, outbuf, nels); } }; workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); @@ -2362,15 +2528,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS + bool convert_incompatible_tensor = false; if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); - fprintf(stderr, "This is required to be able to use k-quants for now!\n"); - fprintf(stderr, "========================================================================================\n\n"); - throw std::runtime_error("Unsupported tensor size encountered\n"); + fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + convert_incompatible_tensor = true; } } if (tensor.name == "output.weight") { @@ -2398,6 +2563,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + if (convert_incompatible_tensor) { + if (tensor.name == "output.weight") { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + fprintf(stderr, "F16 will be used for this tensor instead.\n"); + } else if (tensor.name == "tok_embeddings.weight") { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. + fprintf(stderr, "Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } #endif float * f32_data; @@ -2517,8 +2693,9 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, + params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; @@ -2532,14 +2709,14 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; } - llama_context * ctx = new llama_context(*model, model->vocab); + llama_context * ctx = new llama_context(*model); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -2593,16 +2770,16 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2637,6 +2814,18 @@ struct llama_context * llama_new_context_with_model( } #endif +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); + + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); + while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_backend_free(); + exit(1); + } +#endif + return ctx; } @@ -2759,6 +2948,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + std::vector work_buffer; + while (true) { int32_t n_dims; int32_t length; @@ -2923,8 +3115,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3077,7 +3269,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3097,7 +3288,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + 
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3183,7 +3374,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3203,7 +3393,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3219,7 +3409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { return nread; } -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(path_session, "rb"); // sanity checks @@ -3273,6 +3463,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi return true; } +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + fprintf(stderr, "error loading session file: %s\n", err.what()); + return false; + } +} + bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { llama_file file(path_session, "wb"); @@ -3355,13 +3554,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { return 0; } -int llama_tokenize( - struct llama_context * ctx, +int llama_tokenize_with_model( + const struct llama_model * model, const char * text, llama_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = llama_tokenize(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -3375,8 +3574,29 @@ int llama_tokenize( return res.size(); } +int llama_tokenize( + struct llama_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); +} + +int llama_n_vocab_from_model(const struct llama_model * model) { + return model->vocab.id_to_token.size(); +} + +int llama_n_ctx_from_model(const struct llama_model * model) { + return model->hparams.n_ctx; +} + +int llama_n_embd_from_model(const struct llama_model * model) { + return model->hparams.n_embd; +} + int llama_n_vocab(const struct llama_context * ctx) { - return ctx->vocab.id_to_token.size(); + return ctx->model.vocab.id_to_token.size(); } int llama_n_ctx(const struct llama_context * ctx) { @@ -3387,17 +3607,25 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity) { + int n = 
std::min(capacity, (int) model->vocab.id_to_token.size()); + for (int i = 0; i < n; ++i) { + strings[i] = model->vocab.id_to_token[i].tok.c_str(); + scores[i] = model->vocab.id_to_token[i].score; + } + return n; +} + int llama_get_vocab( const struct llama_context * ctx, const char * * strings, float * scores, int capacity) { - int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); - for (int i = 0; i < n; ++i) { - strings[i] = ctx->vocab.id_to_token[i].tok.c_str(); - scores[i] = ctx->vocab.id_to_token[i].score; - } - return n; + return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); } float * llama_get_logits(struct llama_context * ctx) { @@ -3408,12 +3636,16 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { + if (token >= llama_n_vocab_from_model(model)) { return nullptr; } - return ctx->vocab.id_to_token[token].tok.c_str(); + return model->vocab.id_to_token[token].tok.c_str(); +} + +const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { + return llama_token_to_str_with_model(&ctx->model, token); } llama_token llama_token_bos() { @@ -3428,23 +3660,35 @@ llama_token llama_token_nl() { return 13; } +struct llama_timings llama_get_timings(struct llama_context * ctx) { + struct llama_timings result = { + /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, + /*.t_end_ms =*/ 1.00 * ggml_time_ms(), + /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, + /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, + /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, + /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, + + /*.n_sample =*/ std::max(1, ctx->n_sample), + /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_eval =*/ std::max(1, ctx->n_eval), + }; + + return result; +} void llama_print_timings(struct llama_context * ctx) { - const int64_t t_end_us = ggml_time_us(); - - const int32_t n_sample = std::max(1, ctx->n_sample); - const int32_t n_eval = std::max(1, ctx->n_eval); - const int32_t n_p_eval = std::max(1, ctx->n_p_eval); + const llama_timings timings = llama_get_timings(ctx); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample); + __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval); + __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); + __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms *
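llama_get_timings() above exposes what llama_print_timings() previously computed inline, so callers can derive their own throughput numbers. A small usage sketch, mirroring the arithmetic in the print calls:

const struct llama_timings t = llama_get_timings(ctx);

const double eval_tok_per_s = 1e3 / t.t_eval_ms * t.n_eval;
const double wall_ms        = t.t_end_ms - t.t_start_ms;

fprintf(stderr, "eval: %.2f tok/s, %.2f ms wall\n", eval_tok_per_s, wall_ms);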
timings.n_eval); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 5bb1964bd..e744584f2 100644 --- a/llama.h +++ b/llama.h @@ -89,6 +89,11 @@ extern "C" { int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback @@ -134,6 +139,20 @@ extern "C" { bool quantize_output_tensor; // quantize output.weight } llama_model_quantize_params; + // performance timing information + struct llama_timings { + double t_start_ms; + double t_end_ms; + double t_load_ms; + double t_sample_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_sample; + int32_t n_p_eval; + int32_t n_eval; + }; + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); @@ -144,7 +163,9 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_init_backend(bool numa); + LLAMA_API void llama_backend_init(bool numa); + // Call once at the end of the program - currently only used for MPI + LLAMA_API void llama_backend_free(); LLAMA_API int64_t llama_time_us(); @@ -254,10 +275,21 @@ extern "C" { int n_max_tokens, bool add_bos); + LLAMA_API int llama_tokenize_with_model( + const struct llama_model * model, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); + LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); + LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + // Get the vocabulary as output parameters. // Returns number of results. LLAMA_API int llama_get_vocab( @@ -266,6 +298,12 @@ extern "C" { float * scores, int capacity); + LLAMA_API int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token @@ -278,7 +316,13 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. 
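The new *_with_model / *_from_model variants declared above let tooling inspect and tokenize against a bare llama_model, without the memory cost of a llama_context. A hedged sketch; the file name is illustrative and error handling is omitted:

struct llama_context_params params = llama_context_default_params();
struct llama_model * model = llama_load_model_from_file("7B/ggml-model-q4_0.bin", params);

llama_token tokens[64];
const int n = llama_tokenize_with_model(model, "Hello world", tokens, 64, /*add_bos*/ true);

printf("%d tokens; vocab size %d\n", n, llama_n_vocab_from_model(model));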
Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str( + const struct llama_context * ctx, + llama_token token); + + LLAMA_API const char * llama_token_to_str_with_model( + const struct llama_model * model, + llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence @@ -293,6 +337,18 @@ extern "C" { /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. + /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. + LLAMA_API void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor); + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); @@ -331,6 +387,7 @@ extern "C" { LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); // Performance information + LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_reset_timings(struct llama_context * ctx); diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 5748c8ac2..4e0e02357 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? 
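Per the doc comments above, the candidates array must come straight from the logits (unsorted), and guidance_ctx is a second context over the same model that has been fed the negative prompt. A usage sketch; the scale and smooth_factor values are illustrative only:

// ctx saw the prompt; guidance_ctx saw the negative prompt
llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx,
                                      /*scale*/ 2.0f, /*smooth_factor*/ 1.0f);
llama_token id = llama_sample_token(ctx, &candidates);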
GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - auto funcs = ggml_internal_get_quantize_fn(ggml_type); + auto funcs = ggml_internal_get_type_traits(ggml_type); Stat simple, ggml; @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data()); - else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data()); + if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data()); + else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data()); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 7b18090d6..48758cda8 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -235,7 +235,7 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0); + auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0); std::vector q40; std::vector q41; @@ -261,9 +261,9 @@ int main(int argc, char** argv) { // Note, we do not include this in the timing as in practical application // we already have the quantized model weights. if (useQ4_1) { - funcs.quantize_row_q(x1.data(), q41.data(), kVecSize); + funcs.from_float(x1.data(), q41.data(), kVecSize); } else { - funcs.quantize_row_q(x1.data(), q40.data(), kVecSize); + funcs.from_float(x1.data(), q40.data(), kVecSize); } // Now measure time the dot product needs using the "scalar" version above @@ -282,9 +282,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data()); - else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data()); + auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); + vdot.from_float(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); + else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index e6e39ff8f..02ea6ec15 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -1,6 +1,14 @@ #!/bin/bash -cp -rpv ../ggml/src/ggml.c ./ggml.c -cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu -cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h -cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h +cp -rpv ../ggml/src/ggml.c ./ggml.c +cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h +cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu +cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h +cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp +cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h +cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m +cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal +cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h + +cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c +cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py old mode 100644 new mode 100755 index d12748281..307b7c08d --- a/scripts/verify-checksum-models.py +++ 
b/scripts/verify-checksum-models.py @@ -1,3 +1,5 @@ +#!/bin/env python3 + import os import hashlib diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c..1acf050a7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c index b5a499c1d..01467bc18 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,6 +10,10 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + #define MAX_NARGS 3 #undef MIN @@ -49,7 +53,7 @@ float frand(void) { int irand(int n) { if (n == 0) return 0; - else return rand()%n; + return rand()%n; } void get_random_dims(int64_t * dims, int ndims) { @@ -159,12 +163,14 @@ struct ggml_tensor * get_random_tensor_int( float get_element(const struct ggml_tensor * t, int idx) { if (t->type == GGML_TYPE_F32) { return ((float *)t->data)[idx]; - } else if (t->type == GGML_TYPE_I32) { - return ((int32_t *)t->data)[idx]; - } else { - assert(false); - return INFINITY; } + + if (t->type == GGML_TYPE_I32) { + return ((int32_t *)t->data)[idx]; + } + + assert(false); + return INFINITY; } void set_element(struct ggml_tensor * t, int idx, float value) { @@ -215,15 +221,14 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - gf.n_threads = n_threads; - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - gb.n_threads = n_threads; - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -236,15 +241,16 @@ bool check_gradient( const float xm = x0 - eps; const float xp = x0 + eps; set_element(x[i], k, xp); - ggml_graph_compute(ctx0, &gf); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - ggml_graph_compute(ctx0, &gf); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); set_element(x[i], k, x0); @@ -252,12 +258,13 @@ bool check_gradient( // compute gradient using backward graph ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); const float g1 = get_element(x[i]->grad, k); const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + const float error_rel = g0 != 0 ? 
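check_gradient above compares the backward-graph gradient g1 against a central finite difference taken from the two perturbed forward passes:

// g0 = (f(x + eps) - f(x - eps)) / (2 * eps)    // truncation error is O(eps^2)

which is then held to the max_error_abs and max_error_rel thresholds printed on failure.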
diff --git a/tests/test-opt.c b/tests/test-opt.c
index d001615ee..5531814c4 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -7,6 +7,9 @@
 
 #define MAX_NARGS 2
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
 
 //
 // logging
@@ -33,7 +36,7 @@
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
 
-float frand() {
+float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
 
@@ -114,7 +117,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
     ((float *)t->data)[idx] = value;
 }
 
-int main(int argc, const char ** argv) {
+int main(void) {
     struct ggml_init_params params = {
         .mem_size   = 1024*1024*1024,
         .mem_buffer = NULL,
@@ -137,10 +140,11 @@ int main(int argc, const char ** argv) {
     struct ggml_tensor * d = ggml_sub(ctx, c, ab);
     struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
 
-
     struct ggml_cgraph ge = ggml_build_forward(e);
-    ggml_graph_reset  (&ge);
-    ggml_graph_compute(ctx, &ge);
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe = ggml_get_f32_1d(e, 0);
     printf("%s: e = %.4f\n", __func__, fe);
 
@@ -148,8 +152,10 @@ int main(int argc, const char ** argv) {
 
     ggml_opt(ctx, opt_params, e);
 
-    ggml_graph_reset  (&ge);
-    ggml_graph_compute(ctx, &ge);
+    ggml_graph_reset(&ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe_opt = ggml_get_f32_1d(e, 0);
     printf("%s: original  e = %.4f\n", __func__, fe);
     printf("%s: optimized e = %.4f\n", __func__, fe_opt);
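Across test-grad0.c and test-opt.c the migration is the same: the per-graph n_threads field is gone, and the thread count is supplied when the graph is evaluated. A minimal sketch of the new entry point, assuming the ggml.h from this revision:

    #include "ggml.h"
    #include <cstdio>

    int main(void) {
        struct ggml_init_params params;
        params.mem_size   = 16*1024*1024; // arena for tensors and graph work buffers
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_f32(ctx, 3.0f);
        struct ggml_tensor * s = ggml_sqr(ctx, a);

        struct ggml_cgraph gf = ggml_build_forward(s);
        // before: gf.n_threads = 4; ggml_graph_compute(ctx, &gf);
        ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 4);

        printf("s = %f\n", ggml_get_f32_1d(s, 0)); // 9.0
        ggml_free(ctx);
        return 0;
    }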
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index c40f1b29c..8d3c162d2 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -40,26 +40,26 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -73,15 +73,17 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
 }
 
 // Total dot product error
-float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    qfns.quantize_row_q    (test_data1, tmp_q1.data(), test_size);
-    qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
+    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+
+    qfns.from_float(test_data1, tmp_q1.data(), test_size);
+    vdot.from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
+    qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -123,9 +125,9 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        if (qfns.from_float && qfns.to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
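total_quantization_error() above measures round-trip damage: quantize with from_float, reconstruct with to_float, and score the result against the original row via array_rmse (which this patch does not touch). A textbook RMSE sketch for context; the in-tree array_rmse may normalize slightly differently:

    #include <cmath>
    #include <cstddef>

    // Root-mean-square error between a reference row and its
    // quantize/dequantize round trip; the test compares this against the
    // per-type MAX_QUANTIZATION_TOTAL_ERROR* thresholds.
    float rmse(const float * a, const float * b, size_t n) {
        double sum = 0.0;
        for (size_t i = 0; i < n; i++) {
            const double d = (double)a[i] - (double)b[i];
            sum += d*d;
        }
        return (float)sqrt(sum/(double)n);
    }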
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index c0e361e92..0bb9537f6 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -123,9 +123,9 @@ void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (qfns.from_float && qfns.to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -271,12 +271,12 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        if (qfns.from_float && qfns.to_float) {
             printf("%s\n", ggml_type_name(type));
 
             if (params.op_quantize_row_q_reference) {
@@ -284,7 +284,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q_reference(test_data1, test_q1, size);
+                        qfns.from_float_reference(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -298,7 +298,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q(test_data1, test_q1, size);
+                        qfns.from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -309,11 +309,11 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns.quantize_row_q(test_data1, test_q1, largest);
+                qfns.from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void ) {
-                        qfns.dequantize_row_q(test_q1, test_out, size);
+                        qfns.to_float(test_q1, test_out, size);
                         return test_out[0];
                     };
                     size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -327,7 +327,8 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q_dot(test_data1, test_q1, size);
+                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+                        vdot.from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -338,13 +339,13 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns.quantize_row_q(test_data1, test_q1, largest);
-                qfns.quantize_row_q(test_data2, test_q2, largest);
+                qfns.from_float(test_data1, test_q1, largest);
+                qfns.from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void ) {
                         float result;
-                        qfns.vec_dot_q(size, &result, test_q1, test_q2);
+                        qfns.vec_dot(size, &result, test_q1, test_q2);
                         return result;
                     };
                     size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
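Each timed operation above is wrapped in a small lambda that returns one element of its output, so the compiler cannot optimize the work away; the harness then times repeated calls. A sketch of that pattern, where benchmark_fn is a hypothetical stand-in for the in-tree timing helper (which also reports throughput over many iterations):

    #include <chrono>
    #include <cstdio>
    #include <functional>

    // Hypothetical stand-in for the test's timing helper: run the lambda once,
    // time it, and *use* its return value so the call has an observable effect.
    static void benchmark_fn(size_t n_values, const std::function<float(void)> & fn) {
        const auto t1 = std::chrono::high_resolution_clock::now();
        const float keep = fn();
        const auto t2 = std::chrono::high_resolution_clock::now();
        const double usec = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
        printf("%zu values in %.1f us (result %f)\n", n_values, usec, (double)keep);
    }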
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 20abe7100..87fde1645 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -31,6 +31,8 @@ int main(int argc, char **argv) {
     llama_model * model;
     llama_context * ctx;
 
+    llama_backend_init(false);
+
     // load the vocab
     {
         auto lparams = llama_context_default_params();
@@ -97,5 +99,7 @@ int main(int argc, char **argv) {
     llama_free_model(model);
     llama_free(ctx);
 
+    llama_backend_free();
+
     return 0;
 }
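The tokenizer test now brackets its work with the process-wide backend lifecycle that arrives with the MPI backend in this change set. A minimal sketch of the intended ordering, assuming the llama.h from this revision (the bool argument requests NUMA-aware initialization; false matches the test):

    #include "llama.h"

    int main(void) {
        llama_backend_init(false); // once per process, before loading any model

        // ... llama_load_model_from_file(), llama_new_context_with_model(),
        //     tokenize / evaluate, then llama_free() and llama_free_model() ...

        llama_backend_free();      // once per process, after all contexts are freed
        return 0;
    }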